[CIR][X86] Implement handling for convert-half builtins #173143
[CIR][X86] Implement handling for convert-half builtins #173143 — andykaylor merged 20 commits into llvm:main from
Conversation
|
@llvm/pr-subscribers-clangir @llvm/pr-subscribers-clang Author: Priyanshu Kumar (Priyanshu3820) Changes — Related to: #167765. Full diff: https://github.com/llvm/llvm-project/pull/173143.diff 2 Files Affected:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 75bf25b20f1af..59d467da3a9fb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -362,6 +362,27 @@ static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc,
return builder.createMul(loc, lhs, rhs);
}
+static mlir::Value emitX86CvtF16ToFloatExpr(CIRGenBuilderTy &builder,
+ mlir::Location loc,
+ mlir::Type dstTy,
+ SmallVectorImpl<mlir::Value> &ops) {
+
+ mlir::Value src = ops[0];
+ mlir::Value passthru = ops[1];
+
+ auto vecTy = mlir::cast<cir::VectorType>(src.getType());
+ uint64_t numElems = vecTy.getSize();
+
+ mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElems);
+
+ auto halfTy = cir::VectorType::get(builder.getF16Type(), numElems);
+ mlir::Value srcF16 = builder.createBitcast(loc, src, halfTy);
+
+ mlir::Value res = builder.createFloatingCast(srcF16, dstTy);
+
+ return emitX86Select(builder, loc, mask, res, passthru);
+}
+
static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
llvm::SmallVector<mlir::Value> ops,
bool isSigned) {
@@ -1662,12 +1683,40 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_cmpnltsd:
case X86::BI__builtin_ia32_cmpnlesd:
case X86::BI__builtin_ia32_cmpordsd:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case X86::BI__builtin_ia32_vcvtph2ps_mask:
case X86::BI__builtin_ia32_vcvtph2ps256_mask:
- case X86::BI__builtin_ia32_vcvtph2ps512_mask:
- case X86::BI__builtin_ia32_cvtneps2bf16_128_mask:
+ case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ return emitX86CvtF16ToFloatExpr(builder, loc, convertType(expr->getType()),
+ ops);
+ }
+ case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ mlir::Value intrinsicMask = getMaskVecValue(builder, loc, ops[2], 4);
+ return emitIntrinsicCallOp(builder, loc,
+ "x86.avx512bf16.mask.cvtneps2bf16.128",
+ convertType(expr->getType()),
+ mlir::ValueRange{ops[0], ops[1], intrinsicMask});
+ }
case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
- case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
+ case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ unsigned numElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+ mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
+ StringRef intrinsicName;
+ if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
+ intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
+ else
+ intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
+ mlir::Value intrinsicResult =
+ emitIntrinsicCallOp(builder, loc, intrinsicName, ops[1].getType(),
+ mlir::ValueRange{ops[0]});
+ return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
+ }
case X86::BI__cpuid:
case X86::BI__cpuidex:
case X86::BI__emul:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
new file mode 100644
index 0000000000000..ccfc0d4a6a813
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
@@ -0,0 +1,80 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -target-feature +avx512bf16 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -target-feature +avx512bf16 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -target-feature +avx512bf16 -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+__m256bh test_mm512_mask_cvtneps_pbh(__m256bh src, __mmask16 k, __m512 a) {
+ // CIR-LABEL: @test_mm512_mask_cvtneps_pbh
+ // CIR: cir.call @_mm512_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<16 x !cir.bf16>, !u16i, !cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
+
+ // LLVM-LABEL: @test_mm512_mask_cvtneps_pbh
+ // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512
+
+ // OGCG-LABEL: @test_mm512_mask_cvtneps_pbh
+ // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512
+ return _mm512_mask_cvtneps_pbh(src, k, a);
+}
+
+__m256bh test_mm512_maskz_cvtneps_pbh(__mmask16 k, __m512 a) {
+ // CIR-LABEL: @test_mm512_maskz_cvtneps_pbh
+ // CIR: cir.call @_mm512_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u16i, !cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
+
+ // LLVM-LABEL: @test_mm512_maskz_cvtneps_pbh
+ // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> {{.+}})
+
+ // OGCG-LABEL: @test_mm512_maskz_cvtneps_pbh
+ // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> {{.+}})
+ return _mm512_maskz_cvtneps_pbh(k, a);
+}
+
+__m128bh test_mm256_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m256 a) {
+ // CIR-LABEL: test_mm256_mask_cvtneps_pbh
+ // CIR: cir.call @_mm256_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+
+ // LLVM-LABEL: test_mm256_mask_cvtneps_pbh
+ // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+
+ // OGCG-LABEL: test_mm256_mask_cvtneps_pbh
+ // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+ return _mm256_mask_cvtneps_pbh(src, k, a);
+}
+
+__m128bh test_mm256_maskz_cvtneps_pbh(__mmask8 k, __m256 a) {
+ // CIR-LABEL: test_mm256_maskz_cvtneps_pbh
+ // CIR: cir.call @_mm256_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, !cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+
+ // LLVM-LABEL: test_mm256_maskz_cvtneps_pbh
+ // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+
+ // OGCG-LABEL: test_mm256_maskz_cvtneps_pbh
+ // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+ return _mm256_maskz_cvtneps_pbh(k, a);
+}
+
+__m128bh test_mm_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m128 a) {
+ // CIR-LABEL: test_mm_mask_cvtneps_pbh
+ // CIR: cir.call @_mm_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf1{{.+}}
+
+ // LLVM-LABEL: test_mm_mask_cvtneps_pbh
+ // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+
+ // OGCG-LABEL: test_mm_mask_cvtneps_pbh
+ // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+ return _mm_mask_cvtneps_pbh(src, k, a);
+}
+
+__m128bh test_mm_maskz_cvtneps_pbh(__mmask8 k, __m128 a) {
+ // CIR-LABEL: test_mm_maskz_cvtneps_pbh
+ // CIR: cir.call @_mm_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, !cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+
+ // LLVM-LABEL: test_mm_maskz_cvtneps_pbh
+ // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+
+ // OGCG-LABEL: test_mm_maskz_cvtneps_pbh
+ // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+ return _mm_maskz_cvtneps_pbh(k, a);
+}
|
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
|
hi @andykaylor, @bcardosolopes, I have decided to handle the fp16 builtins in a separate PR (I was having some issues with them, so I thought I'd need some more time for debugging). This PR only contains the bf16 builtins. |
The f16 half-to-float builtins are now implemented in this PR: #173572
andykaylor
left a comment
There was a problem hiding this comment.
This looks good, with one minor suggestion.
Co-authored-by: Andy Kaylor <akaylor@nvidia.com>
There was a problem hiding this comment.
Pull request overview
This PR implements handling for X86 convert-half builtins in the ClangIR code generation framework, specifically targeting AVX512 BF16 intrinsics for converting single-precision floating-point values to bfloat16 format.
- Adds implementation for the
cvtneps2bf16 builtins with mask support (128, 256, and 512-bit variants) - Includes comprehensive test coverage with CIR, LLVM, and original CodeGen verification
- Moves previously unimplemented builtins from NYI (Not Yet Implemented) to functional implementations
Reviewed changes
Copilot reviewed 2 out of 2 changed files in this pull request and generated no comments.
| File | Description |
|---|---|
| clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c | New test file covering mask and maskz variants of cvtneps_pbh functions for 128, 256, and 512-bit vectors with CIR, LLVM, and OGCG verification |
| clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | Implements handlers for __builtin_ia32_cvtneps2bf16_*_mask builtins, removing them from the NYI fallthrough list |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Related to: #167765