@@ -31,5 +31,68 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
3131 ret void
3232}
3333
; Constant-folding test: the bf16 reciprocal intrinsic is called with the
; literal operand 4.0. The expected ISA below contains no reciprocal
; instruction; it stores the immediate 0x3e80 (the bf16 encoding of 0.25)
; through %out.
define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_bf16_constant_4:
; SDAG-TRUE16:       ; %bb.0:
; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
; SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
; SDAG-TRUE16-NEXT:    s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_bf16_constant_4:
; SDAG-FAKE16:       ; %bb.0:
; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT:    s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 4.0) #0
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
54+
; Constant-folding test with an operand whose reciprocal is not exactly
; representable: the bf16 reciprocal intrinsic is called with the literal
; 100.0. The expected ISA below contains no reciprocal instruction; it stores
; the immediate 0x3c24 (the bf16 value nearest to 0.01) through %out.
define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_bf16_constant_100:
; SDAG-TRUE16:       ; %bb.0:
; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
; SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
; SDAG-TRUE16-NEXT:    s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_bf16_constant_100:
; SDAG-FAKE16:       ; %bb.0:
; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT:    s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 100.0) #0
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
75+
; Folding test for an undefined operand: the bf16 reciprocal intrinsic is
; called with `undef`. The expected ISA below contains no reciprocal
; instruction; it stores the immediate 0x7fc0 (a bf16 quiet-NaN bit pattern)
; through %out.
; NOTE(review): `undef` is deprecated in LLVM IR in favour of `poison`. It is
; kept here because the expected output was generated for `undef`; confirm the
; folded result before migrating this test.
define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_undef_bf16:
; SDAG-TRUE16:       ; %bb.0:
; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
; SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT:    flat_store_b16 v0, v1, s[0:1]
; SDAG-TRUE16-NEXT:    s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_undef_bf16:
; SDAG-FAKE16:       ; %bb.0:
; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT:    s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat undef)
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
96+
; Attribute groups used by this test: #0 is attached to intrinsic call sites,
; #1 to the kernel definitions.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
0 commit comments