Skip to content

Commit

Permalink
[NVPTX] Support __usAtomicCAS builtin (#99646)
Browse files Browse the repository at this point in the history
Supported `__usAtomicCAS` builtin originally defined in
`/usr/local/cuda/inlcude/crt/sm_70_rt.hpp`

---------

Co-authored-by: Denis Gerasimov <[email protected]>
Co-authored-by: Gonzalo Brito Gadeschi <[email protected]>
Co-authored-by: Denis.Gerasimov <[email protected]>
  • Loading branch information
4 people authored Aug 28, 2024
1 parent c43190f commit 2d1fba6
Show file tree
Hide file tree
Showing 10 changed files with 319 additions and 92 deletions.
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,9 @@ BUILTIN(__nvvm_atom_xor_gen_ll, "LLiLLiD*LLi", "n")
TARGET_BUILTIN(__nvvm_atom_cta_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
TARGET_BUILTIN(__nvvm_atom_sys_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)

TARGET_BUILTIN(__nvvm_atom_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
BUILTIN(__nvvm_atom_cas_gen_i, "iiD*ii", "n")
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_i, "iiD*ii", "n", SM_60)
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_i, "iiD*ii", "n", SM_60)
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20344,6 +20344,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_min_gen_ull:
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);

case NVPTX::BI__nvvm_atom_cas_gen_us:
case NVPTX::BI__nvvm_atom_cas_gen_i:
case NVPTX::BI__nvvm_atom_cas_gen_l:
case NVPTX::BI__nvvm_atom_cas_gen_ll:
Expand Down Expand Up @@ -20535,6 +20536,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
Expand All @@ -20546,6 +20548,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
{Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
}
case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
Expand Down
14 changes: 14 additions & 0 deletions clang/lib/Headers/__clang_cuda_device_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,20 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
__DEVICE__ unsigned short
__usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p,
unsigned short __cmp,
unsigned short __v) {
return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p,
unsigned short __cmp,
unsigned short __v) {
return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
return __nvvm_atom_add_gen_i((int *)__p, __v);
}
Expand Down
16 changes: 15 additions & 1 deletion clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
Expand Down Expand Up @@ -238,7 +241,8 @@ __shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
unsigned short *usp, unsigned short us, int *ip,
int i, unsigned int *uip, unsigned ui, long *lp,
long l, long long *llp, long long ll) {
// CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
Expand Down Expand Up @@ -577,6 +581,16 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
__nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
// CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
// CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_us(usp, 0, us);
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
__nvvm_atom_cta_cas_gen_us(usp, 0, us);
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
__nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

// CHECK: ret
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());

setMinCmpXchgSizeInBits(32);
setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
}
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2076,6 +2076,12 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64

// atom_cas

def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
Expand All @@ -2089,6 +2095,14 @@ def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;

defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
atomic_cmp_swap_i32_g, i32imm>;
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
Expand Down Expand Up @@ -2300,6 +2314,7 @@ multiclass ATOM2_incdec_impl<string OpStr> {

// atom.cas
multiclass ATOM3_cas_impl<string OpStr> {
defm _b16 : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>;
defm _b32 : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
defm _b64 : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
Expand Down
140 changes: 50 additions & 90 deletions llvm/test/CodeGen/NVPTX/atomics-sm90.ll
Original file line number Diff line number Diff line change
Expand Up @@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
;
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-NEXT: .reg .pred %p<5>;
; CHECKPTX71-NEXT: .reg .b16 %rs<34>;
; CHECKPTX71-NEXT: .reg .b32 %r<4>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start9
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start2
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end8
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start27
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end1
; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start8
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end26
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start45
; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end7
; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start14
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end44
; CHECKPTX71-NEXT: ret;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end13
; CHECKPTX71-NEXT: ret;
%r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
Expand Down
Loading

0 comments on commit 2d1fba6

Please sign in to comment.