11 changes: 8 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -14,6 +14,10 @@
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
class CCIfOrigTypeShaderCCIsSGPR<CCAction A>
: CCIf<[{(!OrigTy->getScalarType()->isFloatTy() &&
!OrigTy->getScalarType()->isHalfTy()) }], A>;


// Calling convention for SI
def CC_SI_Gfx : CallingConv<[
@@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[
>>>
]>;


def RetCC_SI_Shader : CallingConv<[
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, i16, v2i16] , CCAssignToReg<
CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR<CCAssignToReg<
!foreach(i, !range(0, 44), !cast<Register>("SGPR"#i)) // SGPR0-43
>>,
>>>,

// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<
CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16] , CCAssignToReg<
!foreach(i, !range(0, 136), !cast<Register>("VGPR"#i)) // VGPR0-135
>>
]>;
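For illustration, a minimal IR sketch of what the RetCC_SI_Shader change above does (an assumed example, not taken from the PR's tests): on a subtarget without 16-bit instructions, a half shader return is promoted to i32 before the return calling convention runs, and the new OrigTy check keeps that promoted value out of the SGPR row so it is still returned in a VGPR, while a genuinely integer return keeps landing in SGPRs.

define amdgpu_ps half @ret_half(half inreg %x) {
  ret half %x   ; promoted to i32 on gfx6, but OrigTy is half: returned in v0
}

define amdgpu_ps i32 @ret_i32(i32 inreg %x) {
  ret i32 %x    ; integer OrigTy: returned in s0 (the SGPR0-43 row)
}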
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -481,7 +481,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
Custom);

setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64,
Expand);
setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
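A hedged sketch of IR that reaches the strict node handled above (illustrative, not one of the PR's tests): a constrained fpext from half to double legalizes through STRICT_FP16_TO_FP with an f64 result on subtargets where half is promoted, and with the Expand action it now takes the same two-step f16 -> f32 -> f64 path as the non-strict FP16_TO_FP node.

define double @strict_ext_f16_to_f64(half %x) strictfp {
  %r = call double @llvm.experimental.constrained.fpext.f64.f16(
           half %x, metadata !"fpexcept.strict")
  ret double %r
}
declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)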
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1121,6 +1121,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
}

if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
return MVT::i32;

if (VT.getSizeInBits() > 32)
return MVT::i32;

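A small assumed example of the case the new check covers: on gfx6/gfx7 (no 16-bit instructions), a 16-bit value now gets register type i32 for calling-convention purposes, so 16-bit shader arguments and returns are modeled as full 32-bit registers, matching the promoted values that the RetCC_SI_Shader rules above inspect via OrigTy.

define amdgpu_ps i16 @ret_i16(i16 inreg %x) {
  ret i16 %x    ; CC register type is i32 on subtargets without 16-bit insts
}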
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -1664,6 +1664,7 @@ define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> i
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: fma_v2s16_uniform:
94 changes: 48 additions & 46 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -15,22 +15,17 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB0_3
; SI-NEXT: ; %bb.1: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB0_4
; SI-NEXT: .LBB0_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB0_3: ; %cmp.false
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB0_2
; SI-NEXT: .LBB0_4: ; %cmp.true
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: ; %bb.4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_i16_to_f16:
@@ -125,6 +120,7 @@ define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) {
; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
; SI-NEXT: .LBB1_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB1_4:
; SI-NEXT: ; implicit-def: $vgpr0
@@ -199,8 +195,9 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
; SI-LABEL: bitcast_f16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
@@ -294,8 +291,9 @@ define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_f16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, s16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
; SI-NEXT: s_cmp_lg_u32 s17, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_cbranch_scc0 .LBB3_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB3_3
@@ -408,6 +406,8 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
; SI-NEXT: ; %bb.2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_i16_to_bf16:
@@ -502,7 +502,8 @@ define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) {
; SI-NEXT: s_lshl_b32 s4, s6, 16
; SI-NEXT: s_add_i32 s7, s4, 0x30000
; SI-NEXT: .LBB5_3: ; %end
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB5_4:
; SI-NEXT: ; implicit-def: $sgpr7
@@ -577,6 +578,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
; SI-LABEL: bitcast_bf16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0
; SI-NEXT: ; implicit-def: $vgpr0
@@ -720,8 +722,9 @@ define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_bf16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s16, 16
; SI-NEXT: s_cmp_lg_u32 s17, 0
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
@@ -835,29 +838,27 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
; SI-LABEL: bitcast_f16_to_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB8_3
; SI-NEXT: ; %bb.1: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB8_4
; SI-NEXT: .LBB8_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB8_3: ; %cmp.false
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB8_2
; SI-NEXT: .LBB8_4: ; %cmp.true
; SI-NEXT: s_cbranch_execz .LBB8_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: .LBB8_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_f16_to_bf16:
@@ -942,21 +943,24 @@ define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_f16_to_bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, s16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
; SI-NEXT: s_cmp_lg_u32 s17, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_cbranch_scc0 .LBB9_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT: s_cbranch_execnz .LBB9_3
; SI-NEXT: .LBB9_2: ; %cmp.true
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT: .LBB9_3: ; %end
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB9_4:
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_branch .LBB9_2
;
; VI-LABEL: bitcast_f16_to_bf16_scalar:
@@ -1049,30 +1053,26 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
; SI-LABEL: bitcast_bf16_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB10_3
; SI-NEXT: ; %bb.1: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB10_4
; SI-NEXT: .LBB10_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB10_3: ; %cmp.false
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB10_2
; SI-NEXT: .LBB10_4: ; %cmp.true
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: ; %bb.4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_bf16_to_f16:
@@ -1194,22 +1194,24 @@ define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_bf16_to_f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s16, 16
; SI-NEXT: s_cmp_lg_u32 s17, 0
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4
; SI-NEXT: s_cbranch_scc0 .LBB11_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_cbranch_execnz .LBB11_3
; SI-NEXT: .LBB11_2: ; %cmp.true
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: .LBB11_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB11_4:
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_branch .LBB11_2
;
; VI-LABEL: bitcast_bf16_to_f16_scalar: