From 9b6777ce7af047c8c680d1eb55595df86aafb7c5 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean@nvidia.com>
Date: Fri, 18 Jul 2025 19:26:28 +0000
Subject: [PATCH 1/3] [NVPTX] Pre-commit tests for fptrunc of v2f32 + store

---
 llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index af3cb63082e78..ea32170643702 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -1957,6 +1957,37 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 {
   ret <2 x float> %r
 }
 
+define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) {
+; CHECK-LABEL: test_trunc_to_v2bf16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0];
+; CHECK-NEXT:    st.b32 [%rd2], %rd1;
+; CHECK-NEXT:    ret;
+  %trunc = fptrunc <2 x float> %a to <2 x bfloat>
+  store <2 x bfloat> %trunc, ptr %p
+  ret void
+}
+
+define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) {
+; CHECK-LABEL: test_trunc_to_v2f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0];
+; CHECK-NEXT:    st.b32 [%rd2], %rd1;
+; CHECK-NEXT:    ret;
+  %trunc = fptrunc <2 x float> %a to <2 x half>
+  store <2 x half> %trunc, ptr %p
+  ret void
+}
+
+
 attributes #0 = { nounwind }
 attributes #1 = { "unsafe-fp-math" = "true" }
 attributes #2 = { "denormal-fp-math"="preserve-sign" }

From e0923957f0a644dbc81a6d70533b2bcad68aef2c Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean@nvidia.com>
Date: Fri, 18 Jul 2025 19:27:57 +0000
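Subject: [PATCH 2/3] [NVPTX] Prevent fptrunc of v2f32 from being folded into
 store

Mark truncating stores from v2f32 to v2f16 and from v2f32 to v2bf16 as
Expand. The CHECK lines removed below stored the 64-bit parameter register
directly with st.b32 and contained no cvt instruction; with this change a
cvt.rn.f16x2.f32 / cvt.rn.bf16x2.f32 is emitted before the store.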

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  2 ++
 llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 12 ++++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7aa06f9079b09..4e7002feea215 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -731,6 +731,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
 
   // PTX does not support load / store predicate registers
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index ea32170643702..f24428ebcfb8c 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -1960,12 +1960,14 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 {
 define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) {
 ; CHECK-LABEL: test_trunc_to_v2bf16(
 ; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1];
-; CHECK-NEXT:    ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0];
-; CHECK-NEXT:    st.b32 [%rd2], %rd1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0];
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.b32 [%rd2], %r3;
 ; CHECK-NEXT:    ret;
   %trunc = fptrunc <2 x float> %a to <2 x bfloat>
   store <2 x bfloat> %trunc, ptr %p
@@ -1975,12 +1977,14 @@ define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) {
 define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) {
 ; CHECK-LABEL: test_trunc_to_v2f16(
 ; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1];
-; CHECK-NEXT:    ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0];
-; CHECK-NEXT:    st.b32 [%rd2], %rd1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0];
+; CHECK-NEXT:    cvt.rn.f16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.b32 [%rd2], %r3;
 ; CHECK-NEXT:    ret;
   %trunc = fptrunc <2 x float> %a to <2 x half>
   store <2 x half> %trunc, ptr %p

From d7ae03f8f3b583f17e28a97c9341c7302aa96d3f Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean@nvidia.com>
Date: Fri, 18 Jul 2025 20:34:05 +0000
Subject: [PATCH 3/3] Fix up test_fptrunc_2xfloat checks (bf16x2, f16x2)

---
 llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll | 7 ++++---
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll  | 9 +++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index e2a914d8cfc36..ba5813c869236 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -359,11 +359,12 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
 define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %rd1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x bfloat>
   ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index d0e2c1817f696..c765e34f3153f 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1433,11 +1433,16 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_fptrunc_2xfloat(
 ; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %rd1;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %r1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x half>
   ret <2 x half> %r