From 987617db2290093ca2d7a5c28da51783f287e1eb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:25:59 -0700 Subject: [PATCH 01/56] update config file with use separate registries --- configs/cuda_bf16.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 3f315e0f7..29aa1d2dd 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -1,5 +1,7 @@ Name: Numba Bfloat16 -Version: 0.0.1 +Version: 0.0.2 +GPU Arch: + - sm_80 # The first architecture to support bfloat16 Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h File List: - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h @@ -22,3 +24,4 @@ Shim Include Override: "\"cuda_bf16.h\"" Additional Import: - os Require Pynvjitlink: False +Use Separate Registry: true From feb8a09988750f79d8e52116fb4262413ae2f8c6 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:30:49 -0700 Subject: [PATCH 02/56] regenerate bfloat16 bindings with lakshayg/Numbast@6282df4 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 14086 +++++++++++++++-- 1 file changed, 12671 insertions(+), 1415 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index e6220fe67..4fd6c50e4 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,12 +1,12 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: -# Ast_canopy version: 0.3.0 -# Numbast version: 0.3.0 -# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal -# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True} -# Config file path (relative to the 
path of the generated binding): ../../../../configs/cuda_bf16.yml +# Ast_canopy version: 0.4.0 +# Numbast version: 0.4.0 +# Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ +# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} +# Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml # Cudatoolkit version: (12, 8) -# Default CUDA_HOME path: /home/wangm/micromamba/envs/numbast +# Default CUDA_HOME path: /home/wangm/miniforge3/envs/numbast # Imports: @@ -23,11 +23,14 @@ make_attribute_wrapper, register_model, ) +from numba.core.imputils import Registry as TargetRegistry +from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda.cudadecl import register, register_attr, register_global -from numba.cuda.cudaimpl import lower +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 +from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( CPointer, @@ -46,10 +49,23 @@ uint16, uint32, uint64, + void, ) +float32x2 = vector_types["float32x2"] + # Setups: + +typing_registry = TypingRegistry() +register = typing_registry.register +register_attr = typing_registry.register_attr +register_global = typing_registry.register_global +target_registry = TargetRegistry() +lower = target_registry.lower +lower_attr = target_registry.lower_getattr +lower_constant = target_registry.lower_constant + # Shim Stream: @@ -76,83 +92,84 @@ def reset(self): shim_stream.write(shim_prefix) shim_obj = CUSource(shim_stream) + # 
Enums: # Structs: -# Typing for unnamed1401637 -class _type_class_unnamed1401637(Type): +# Typing for unnamed1405307 +class _type_class_unnamed1405307(Type): def __init__(self): - super().__init__(name="unnamed1401637") + super().__init__(name="unnamed1405307") self.alignof_ = 2 self.bitwidth = 2 * 8 -_type_unnamed1401637 = _type_class_unnamed1401637() +_type_unnamed1405307 = _type_class_unnamed1405307() # Make Python API for struct -unnamed1401637 = type("unnamed1401637", (), {"_nbtype": _type_unnamed1401637}) +unnamed1405307 = type("unnamed1405307", (), {"_nbtype": _type_unnamed1405307}) -as_numba_type.register(unnamed1401637, _type_unnamed1401637) +as_numba_type.register(unnamed1405307, _type_unnamed1405307) -@register_model(_type_class_unnamed1401637) -class _model_unnamed1401637(StructModel): +@register_model(_type_class_unnamed1405307) +class _model_unnamed1405307(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401637(AttributeTemplate): - key = globals()["unnamed1401637"] +class _attr_typing_unnamed1405307(AttributeTemplate): + key = globals()["unnamed1405307"] def resolve_x(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401637, "x", "x") +make_attribute_wrapper(_type_class_unnamed1405307, "x", "x") @register -class _ctor_template_unnamed1401637(ConcreteTemplate): - key = globals()["unnamed1401637"] +class _ctor_template_unnamed1405307(ConcreteTemplate): + key = globals()["unnamed1405307"] cases = [] -register_global(unnamed1401637, Function(_ctor_template_unnamed1401637)) +register_global(unnamed1405307, Function(_ctor_template_unnamed1405307)) -# Typing for unnamed1401746 -class _type_class_unnamed1401746(Type): +# Typing for unnamed1405416 +class _type_class_unnamed1405416(Type): def __init__(self): - super().__init__(name="unnamed1401746") + super().__init__(name="unnamed1405416") self.alignof_ = 4 self.bitwidth = 4 * 8 
-_type_unnamed1401746 = _type_class_unnamed1401746() +_type_unnamed1405416 = _type_class_unnamed1405416() # Make Python API for struct -unnamed1401746 = type("unnamed1401746", (), {"_nbtype": _type_unnamed1401746}) +unnamed1405416 = type("unnamed1405416", (), {"_nbtype": _type_unnamed1405416}) -as_numba_type.register(unnamed1401746, _type_unnamed1401746) +as_numba_type.register(unnamed1405416, _type_unnamed1405416) -@register_model(_type_class_unnamed1401746) -class _model_unnamed1401746(StructModel): +@register_model(_type_class_unnamed1405416) +class _model_unnamed1405416(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16), ("y", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401746(AttributeTemplate): - key = globals()["unnamed1401746"] +class _attr_typing_unnamed1405416(AttributeTemplate): + key = globals()["unnamed1405416"] def resolve_x(self, obj): return uint16 @@ -161,19 +178,19 @@ def resolve_y(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401746, "x", "x") +make_attribute_wrapper(_type_class_unnamed1405416, "x", "x") -make_attribute_wrapper(_type_class_unnamed1401746, "y", "y") +make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") @register -class _ctor_template_unnamed1401746(ConcreteTemplate): - key = globals()["unnamed1401746"] +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] cases = [] -register_global(unnamed1401746, Function(_ctor_template_unnamed1401746)) +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) # Typing for __nv_bfloat16 @@ -200,17 +217,17 @@ def __init__(self, dmm, fe_type): super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) -def _lower___nv_bfloat16_void(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_1(int &ignore, __nv_bfloat16 *self ) { + 
_ZN13__nv_bfloat16C1Ev_nbst(int &ignore, __nv_bfloat16 *self ) { new (self) __nv_bfloat16(); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_1", + "_ZN13__nv_bfloat16C1Ev_nbst", int32( CPointer(_type___nv_bfloat16), ), @@ -224,9 +241,7 @@ def __nv_bfloat16_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -250,31 +265,31 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat16_void(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_2(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { + _ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { new (self) __nv_bfloat16(*hr); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_2", - int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1401637)), + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", + int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1405307)), ) def __nv_bfloat16_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat16(arg_0, arg_1) - @lower(__nv_bfloat16, _type_unnamed1401637) + @lower(__nv_bfloat16, _type_unnamed1405307) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_2", shim_raw_str + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", shim_raw_str 
) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -291,7 +306,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat16), - CPointer(_type_unnamed1401637), + CPointer(_type_unnamed1405307), ), (selfptr, *argptrs), ) @@ -299,21 +314,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(_type_unnamed1405307)), + value, + ) + -_lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) -def _lower___nv_bfloat16_float16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_3(int &ignore, __nv_bfloat16 *self , __half* f) { + _ZN13__nv_bfloat16C1E6__half_nbst(int &ignore, __nv_bfloat16 *self , __half* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_3", + "_ZN13__nv_bfloat16C1E6__half_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float16)), ) @@ -324,7 +348,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_3", shim_raw_str + "_ZN13__nv_bfloat16C1E6__half_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -346,20 +370,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat16_float16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) -def _lower___nv_bfloat16_float32(shim_stream, shim_obj): +def 
_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_4(int &ignore, __nv_bfloat16 *self , float* f) { + _ZN13__nv_bfloat16C1Ef_nbst(int &ignore, __nv_bfloat16 *self , float* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_4", + "_ZN13__nv_bfloat16C1Ef_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float32)), ) @@ -369,9 +393,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, float32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_4", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ef_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -391,21 +413,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(float32)), + value, + ) + -_lower___nv_bfloat16_float32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) -def _lower___nv_bfloat16_float64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_5(int &ignore, __nv_bfloat16 *self , double* f) { + _ZN13__nv_bfloat16C1Ed_nbst(int &ignore, __nv_bfloat16 *self , double* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_5", + "_ZN13__nv_bfloat16C1Ed_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float64)), ) @@ -415,9 +446,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): 
@lower(__nv_bfloat16, float64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_5", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ed_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -437,21 +466,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(float64)), + value, + ) -_lower___nv_bfloat16_float64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) -def _lower___nv_bfloat16_int16(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_6(int &ignore, __nv_bfloat16 *self , short* val) { + _ZN13__nv_bfloat16C1Es_nbst(int &ignore, __nv_bfloat16 *self , short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_6", + "_ZN13__nv_bfloat16C1Es_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int16)), ) @@ -461,9 +499,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int16) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_6", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Es_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -483,21 +519,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int16, _type___nv_bfloat16) + def 
conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int16)), + value, + ) + -_lower___nv_bfloat16_int16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_7(int &ignore, __nv_bfloat16 *self , unsigned short* val) { + _ZN13__nv_bfloat16C1Et_nbst(int &ignore, __nv_bfloat16 *self , unsigned short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_7", + "_ZN13__nv_bfloat16C1Et_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint16)), ) @@ -507,9 +552,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint16) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_7", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Et_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -529,21 +572,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint16)), + value, + ) + -_lower___nv_bfloat16_uint16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) -def _lower___nv_bfloat16_int32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_8(int &ignore, __nv_bfloat16 *self , int* val) { + _ZN13__nv_bfloat16C1Ei_nbst(int 
&ignore, __nv_bfloat16 *self , int* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_8", + "_ZN13__nv_bfloat16C1Ei_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int32)), ) @@ -553,9 +605,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_8", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ei_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -575,21 +625,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int32)), + value, + ) + -_lower___nv_bfloat16_int32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_9(int &ignore, __nv_bfloat16 *self , unsigned int* val) { + _ZN13__nv_bfloat16C1Ej_nbst(int &ignore, __nv_bfloat16 *self , unsigned int* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_9", + "_ZN13__nv_bfloat16C1Ej_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint32)), ) @@ -599,9 +658,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_9", shim_raw_str - ) + 
shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ej_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -621,21 +678,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint32)), + value, + ) -_lower___nv_bfloat16_uint32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_10(int &ignore, __nv_bfloat16 *self , long* val) { + _ZN13__nv_bfloat16C1El_nbst(int &ignore, __nv_bfloat16 *self , long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_10", + "_ZN13__nv_bfloat16C1El_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -645,9 +711,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_10", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1El_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -667,21 +731,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int64)), + value, + ) + -_lower___nv_bfloat16_int64(shim_stream, shim_obj) 
+_lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_11(int &ignore, __nv_bfloat16 *self , unsigned long* val) { + _ZN13__nv_bfloat16C1Em_nbst(int &ignore, __nv_bfloat16 *self , unsigned long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_11", + "_ZN13__nv_bfloat16C1Em_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -691,9 +764,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_11", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Em_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -713,21 +784,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint64)), + value, + ) + -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_12(int &ignore, __nv_bfloat16 *self , long long* val) { + _ZN13__nv_bfloat16C1Ex_nbst(int &ignore, __nv_bfloat16 *self , long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_12", + 
"_ZN13__nv_bfloat16C1Ex_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -737,9 +817,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_12", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ex_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -759,21 +837,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int64)), + value, + ) + -_lower___nv_bfloat16_int64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_13(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { + _ZN13__nv_bfloat16C1Ey_nbst(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_13", + "_ZN13__nv_bfloat16C1Ey_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -783,9 +870,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_13", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ey_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ 
-805,8 +890,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint64)), + value, + ) -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) + +_lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @register @@ -816,7 +910,7 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): signature( _type___nv_bfloat16, ), - signature(_type___nv_bfloat16, _type_unnamed1401637), + signature(_type___nv_bfloat16, _type_unnamed1405307), signature(_type___nv_bfloat16, float16), signature(_type___nv_bfloat16, float32), signature(_type___nv_bfloat16, float64), @@ -834,18 +928,18 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): register_global(__nv_bfloat16, Function(_ctor_template___nv_bfloat16)) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", - _type_unnamed1401637( + "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -853,11 +947,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, 
value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -870,28 +965,28 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_2(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", - _type_unnamed1401637( + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -899,11 +994,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", shim_raw_str + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = 
builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -916,27 +1012,27 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) def _from___nv_bfloat16_to_float32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_float_1(float &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1(float &retval, __nv_bfloat16 *self) { retval = self->operator float(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_float_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", float32( CPointer(_type___nv_bfloat16), ), @@ -949,7 +1045,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_float_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -975,14 +1071,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_signed_char_1(signed char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1(signed char &retval, __nv_bfloat16 *self) { retval = self->operator signed char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_signed_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -995,7 +1091,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, 
fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_signed_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1021,14 +1117,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_char_1(unsigned char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1(unsigned char &retval, __nv_bfloat16 *self) { retval = self->operator unsigned char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", uint8( CPointer(_type___nv_bfloat16), ), @@ -1041,7 +1137,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1067,14 +1163,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_char_1(char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1(char &retval, __nv_bfloat16 *self) { retval = self->operator char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -1087,7 +1183,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1113,14 +1209,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_short_1(short &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1(short &retval, __nv_bfloat16 *self) { retval = self->operator short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", int16( CPointer(_type___nv_bfloat16), ), @@ -1133,7 +1229,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1159,14 +1255,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_short_1(unsigned short &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1(unsigned short &retval, __nv_bfloat16 *self) { retval = self->operator unsigned short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", uint16( CPointer(_type___nv_bfloat16), ), @@ -1179,7 +1275,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1205,14 +1301,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_int_1(int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1(int &retval, __nv_bfloat16 *self) { retval = self->operator int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", int32( CPointer(_type___nv_bfloat16), ), @@ -1225,7 +1321,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1251,14 +1347,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_int_1(unsigned int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1(unsigned int &retval, __nv_bfloat16 *self) { retval = self->operator unsigned int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", uint32( CPointer(_type___nv_bfloat16), ), @@ -1271,7 +1367,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1297,14 +1393,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_1(long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1(long &retval, __nv_bfloat16 *self) { retval = self->operator long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1317,7 +1413,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1343,14 +1439,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_1(unsigned long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1(unsigned long &retval, __nv_bfloat16 *self) { retval = self->operator unsigned long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1363,7 +1459,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - 
"____nv_bfloat16_operator_unsigned_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1389,14 +1485,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_long_1(long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1(long long &retval, __nv_bfloat16 *self) { retval = self->operator long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1409,7 +1505,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1435,14 +1531,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_long_1(unsigned long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1(unsigned long long &retval, __nv_bfloat16 *self) { retval = self->operator unsigned long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1455,7 +1551,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - 
"____nv_bfloat16_operator_unsigned_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1481,14 +1577,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_bool_1(bool &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1(bool &retval, __nv_bfloat16 *self) { retval = self->operator bool(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_bool_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", bool_( CPointer(_type___nv_bfloat16), ), @@ -1501,7 +1597,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_bool_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1565,17 +1661,17 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class___nv_bfloat162, "y", "y") -def _lower___nv_bfloat162_void(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_1(int &ignore, __nv_bfloat162 *self ) { + _ZN14__nv_bfloat162C1Ev_nbst(int &ignore, __nv_bfloat162 *self ) { new (self) __nv_bfloat162(); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_1", + "_ZN14__nv_bfloat162C1Ev_nbst", int32( CPointer(_type___nv_bfloat162), ), @@ -1589,9 +1685,7 @@ def __nv_bfloat162_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - 
"____nv_bfloat162____nv_bfloat162_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN14__nv_bfloat162C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" ) @@ -1615,20 +1709,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162_void(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_2(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1EOS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_2", + "_ZN14__nv_bfloat162C1EOS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1639,7 +1733,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_2", shim_raw_str + "_ZN14__nv_bfloat162C1EOS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1665,22 +1759,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -): +def _lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_3(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) { + _ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst(int &ignore, __nv_bfloat162 
*self , __nv_bfloat16* a, __nv_bfloat16* b) { new (self) __nv_bfloat162(*a, *b); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_3", + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", int32( CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat16), @@ -1695,7 +1787,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1, arg_2): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_3", shim_raw_str + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1722,22 +1814,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -) +_lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_4(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1ERKS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_4", + "_ZN14__nv_bfloat162C1ERKS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1748,7 +1838,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_4", shim_raw_str + "_ZN14__nv_bfloat162C1ERKS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1774,31 +1864,31 @@ def ctor_impl(context, 
builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_5(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { + _ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { new (self) __nv_bfloat162(*h2r); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_5", - int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1401746)), + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", + int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1405416)), ) def __nv_bfloat162_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat162(arg_0, arg_1) - @lower(__nv_bfloat162, _type_unnamed1401746) + @lower(__nv_bfloat162, _type_unnamed1405416) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_5", shim_raw_str + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1815,7 +1905,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat162), - CPointer(_type_unnamed1401746), + CPointer(_type_unnamed1405416), ), (selfptr, *argptrs), ) @@ -1823,8 +1913,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) + @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat162, CPointer(_type_unnamed1405416)), + 
value, + ) + -_lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) @register @@ -1839,25 +1938,25 @@ class _ctor_template___nv_bfloat162(ConcreteTemplate): _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 ), signature(_type___nv_bfloat162, _type___nv_bfloat162), - signature(_type___nv_bfloat162, _type_unnamed1401746), + signature(_type___nv_bfloat162, _type_unnamed1405416), ] register_global(__nv_bfloat162, Function(_ctor_template___nv_bfloat162)) -def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): +def _from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162_operator___nv_bfloat162_raw_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { + ____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { retval = self->operator __nv_bfloat162_raw(); return 0; } """ _op_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", - _type_unnamed1401746( + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + _type_unnamed1405416( CPointer(_type___nv_bfloat162), ), ) @@ -1865,11 +1964,12 @@ def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat162(arg): return _op_decl___nv_bfloat162(arg) - @lower_cast(_type___nv_bfloat162, _type_unnamed1401746) + @lower_cast(_type___nv_bfloat162, _type_unnamed1405416) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", shim_raw_str + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1882,1997 +1982,2083 @@ def 
impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat162, signature( - _type_unnamed1401746, + _type_unnamed1405416, CPointer(_type___nv_bfloat162), ), (ptr,), ) -_from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj) +_from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj) # Functions: -def make_bfloat162(): +def __double2bfloat16(): pass -def _make_bfloat162_1_lower(shim_stream, shim_obj): +def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - make_bfloat162_1(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { - retval = make_bfloat162(*x, *y); + _ZL17__double2bfloat16d_nbst(__nv_bfloat16 &retval , double* a) { + retval = __double2bfloat16(*a); return 0; } """ - make_bfloat162_1 = declare_device( - "make_bfloat162_1", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL17__double2bfloat16d_nbst = declare_device( + "_ZL17__double2bfloat16d_nbst", _type___nv_bfloat16(CPointer(float64)) ) - def make_bfloat162_1_caller(arg_0, arg_1): - return make_bfloat162_1(arg_0, arg_1) + def _ZL17__double2bfloat16d_nbst_caller(arg_0): + return _ZL17__double2bfloat16d_nbst(arg_0) - @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__double2bfloat16, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("make_bfloat162_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - make_bfloat162_1_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__double2bfloat16d_nbst_caller, 
+ signature(_type___nv_bfloat16, CPointer(float64)), ptrs, ) -_make_bfloat162_1_lower(shim_stream, shim_obj) +_lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj) -def htrunc(): +def __float2bfloat16(): pass -def _htrunc_1_lower(shim_stream, shim_obj): +def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htrunc_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = htrunc(*h); + _ZL16__float2bfloat16f_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16(*a); return 0; } """ - htrunc_1 = declare_device( - "htrunc_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL16__float2bfloat16f_nbst = declare_device( + "_ZL16__float2bfloat16f_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def htrunc_1_caller(arg_0): - return htrunc_1(arg_0) + def _ZL16__float2bfloat16f_nbst_caller(arg_0): + return _ZL16__float2bfloat16f_nbst(arg_0) - @lower(htrunc, _type___nv_bfloat16) + @lower(__float2bfloat16, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htrunc_1", shim_raw_str) + shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htrunc_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL16__float2bfloat16f_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_htrunc_1_lower(shim_stream, shim_obj) +_lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj) -def hceil(): +def __float2bfloat16_rn(): pass -def _hceil_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hceil_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = 
hceil(*h); + _ZL19__float2bfloat16_rnf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rn(*a); return 0; } """ - hceil_1 = declare_device( - "hceil_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rnf_nbst = declare_device( + "_ZL19__float2bfloat16_rnf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hceil_1_caller(arg_0): - return hceil_1(arg_0) + def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rnf_nbst(arg_0) - @lower(hceil, _type___nv_bfloat16) + @lower(__float2bfloat16_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hceil_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rnf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hceil_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj) -def hfloor(): +def __float2bfloat16_rz(): pass -def _hfloor_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hfloor_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hfloor(*h); + _ZL19__float2bfloat16_rzf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rz(*a); return 0; } """ - hfloor_1 = declare_device( - "hfloor_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rzf_nbst = declare_device( + "_ZL19__float2bfloat16_rzf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hfloor_1_caller(arg_0): - return 
hfloor_1(arg_0) + def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rzf_nbst(arg_0) - @lower(hfloor, _type___nv_bfloat16) + @lower(__float2bfloat16_rz, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hfloor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rzf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hfloor_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rzf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hfloor_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj) -def hrint(): +def __float2bfloat16_rd(): pass -def _hrint_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrint_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hrint(*h); + _ZL19__float2bfloat16_rdf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rd(*a); return 0; } """ - hrint_1 = declare_device( - "hrint_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rdf_nbst = declare_device( + "_ZL19__float2bfloat16_rdf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hrint_1_caller(arg_0): - return hrint_1(arg_0) + def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rdf_nbst(arg_0) - @lower(hrint, _type___nv_bfloat16) + @lower(__float2bfloat16_rd, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rdf_nbst", 
shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrint_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rdf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hrint_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj) -def h2trunc(): +def __float2bfloat16_ru(): pass -def _h2trunc_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2trunc_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2trunc(*h); + _ZL19__float2bfloat16_ruf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_ru(*a); return 0; } """ - h2trunc_1 = declare_device( - "h2trunc_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__float2bfloat16_ruf_nbst = declare_device( + "_ZL19__float2bfloat16_ruf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def h2trunc_1_caller(arg_0): - return h2trunc_1(arg_0) + def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_ruf_nbst(arg_0) - @lower(h2trunc, _type___nv_bfloat162) + @lower(__float2bfloat16_ru, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2trunc_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_ruf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2trunc_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__float2bfloat16_ruf_nbst_caller, + 
signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_h2trunc_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj) -def h2ceil(): +def __bfloat162float(): pass -def _h2ceil_1_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2ceil_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2ceil(*h); + _ZL16__bfloat162float13__nv_bfloat16_nbst(float &retval , __nv_bfloat16* a) { + retval = __bfloat162float(*a); return 0; } """ - h2ceil_1 = declare_device( - "h2ceil_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__bfloat162float13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", + float32(CPointer(_type___nv_bfloat16)), ) - def h2ceil_1_caller(arg_0): - return h2ceil_1(arg_0) + def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - @lower(h2ceil, _type___nv_bfloat162) + @lower(__bfloat162float, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2ceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2ceil_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__bfloat162float13__nv_bfloat16_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2ceil_1_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2floor(): +def __float2bfloat162_rn(): pass -def _h2floor_1_lower(shim_stream, shim_obj): +def 
_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2floor_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2floor(*h); + _ZL20__float2bfloat162_rnf_nbst(__nv_bfloat162 &retval , float* a) { + retval = __float2bfloat162_rn(*a); return 0; } """ - h2floor_1 = declare_device( - "h2floor_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL20__float2bfloat162_rnf_nbst = declare_device( + "_ZL20__float2bfloat162_rnf_nbst", + _type___nv_bfloat162(CPointer(float32)), ) - def h2floor_1_caller(arg_0): - return h2floor_1(arg_0) + def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): + return _ZL20__float2bfloat162_rnf_nbst(arg_0) - @lower(h2floor, _type___nv_bfloat162) + @lower(__float2bfloat162_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2floor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__float2bfloat162_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2floor_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL20__float2bfloat162_rnf_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32)), ptrs, ) -_h2floor_1_lower(shim_stream, shim_obj) +_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj) -def h2rint(): +def __floats2bfloat162_rn(): pass -def _h2rint_1_lower(shim_stream, shim_obj): +def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rint_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2rint(*h); + _ZL21__floats2bfloat162_rnff_nbst(__nv_bfloat162 &retval , float* a, float* b) { + retval = __floats2bfloat162_rn(*a, *b); return 0; } """ - h2rint_1 = declare_device( - 
"h2rint_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL21__floats2bfloat162_rnff_nbst = declare_device( + "_ZL21__floats2bfloat162_rnff_nbst", + _type___nv_bfloat162(CPointer(float32), CPointer(float32)), ) - def h2rint_1_caller(arg_0): - return h2rint_1(arg_0) + def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): + return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - @lower(h2rint, _type___nv_bfloat162) + @lower(__floats2bfloat162_rn, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__floats2bfloat162_rnff_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rint_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL21__floats2bfloat162_rnff_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(float32), CPointer(float32) + ), ptrs, ) -_h2rint_1_lower(shim_stream, shim_obj) +_lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj) -def hsqrt(): +def __low2float(): pass -def _hsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsqrt(*a); + _ZL11__low2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __low2float(*a); return 0; } """ - hsqrt_1 = declare_device( - "hsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL11__low2float14__nv_bfloat162_nbst = declare_device( + "_ZL11__low2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hsqrt_1_caller(arg_0): - return hsqrt_1(arg_0) + def 
_ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - @lower(hsqrt, _type___nv_bfloat16) + @lower(__low2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL11__low2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL11__low2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrsqrt(): +def __high2float(): pass -def _hrsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrsqrt(*a); + _ZL12__high2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __high2float(*a); return 0; } """ - hrsqrt_1 = declare_device( - "hrsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL12__high2float14__nv_bfloat162_nbst = declare_device( + "_ZL12__high2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hrsqrt_1_caller(arg_0): - return hrsqrt_1(arg_0) + def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - @lower(hrsqrt, _type___nv_bfloat16) + @lower(__high2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrsqrt_1", 
shim_raw_str) + shim_stream.write_with_key( + "_ZL12__high2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL12__high2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hrsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrcp(): +def __float22bfloat162_rn(): pass -def _hrcp_1_lower(shim_stream, shim_obj): +def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrcp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrcp(*a); + _ZL21__float22bfloat162_rn6float2_nbst(__nv_bfloat162 &retval , float2* a) { + retval = __float22bfloat162_rn(*a); return 0; } """ - hrcp_1 = declare_device( - "hrcp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL21__float22bfloat162_rn6float2_nbst = declare_device( + "_ZL21__float22bfloat162_rn6float2_nbst", + _type___nv_bfloat162(CPointer(float32x2)), ) - def hrcp_1_caller(arg_0): - return hrcp_1(arg_0) + def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): + return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - @lower(hrcp, _type___nv_bfloat16) + @lower(__float22bfloat162_rn, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__float22bfloat162_rn6float2_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - 
hrcp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL21__float22bfloat162_rn6float2_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32x2)), ptrs, ) -_hrcp_1_lower(shim_stream, shim_obj) +_lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj) -def hlog(): +def __bfloat1622float2(): pass -def _hlog_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog(*a); + _ZL18__bfloat1622float214__nv_bfloat162_nbst(float2 &retval , __nv_bfloat162* a) { + retval = __bfloat1622float2(*a); return 0; } """ - hlog_1 = declare_device( - "hlog_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat1622float214__nv_bfloat162_nbst = declare_device( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", + float32x2(CPointer(_type___nv_bfloat162)), ) - def hlog_1_caller(arg_0): - return hlog_1(arg_0) + def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): + return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - @lower(hlog, _type___nv_bfloat16) + @lower(__bfloat1622float2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller, + signature(float32x2, CPointer(_type___nv_bfloat162)), ptrs, ) -_hlog_1_lower(shim_stream, shim_obj) 
+_lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj) -def hlog2(): +def __bfloat162char_rz(): pass -def _hlog2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog2(*a); + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(signed char &retval , __nv_bfloat16* h) { + retval = __bfloat162char_rz(*h); return 0; } """ - hlog2_1 = declare_device( - "hlog2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", + int8(CPointer(_type___nv_bfloat16)), ) - def hlog2_1_caller(arg_0): - return hlog2_1(arg_0) + def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog2, _type___nv_bfloat16) + @lower(__bfloat162char_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller, + signature(int8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hlog10(): +def __bfloat162uchar_rz(): pass -def _hlog10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int - hlog10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog10(*a); + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(unsigned char &retval , __nv_bfloat16* h) { + retval = __bfloat162uchar_rz(*h); return 0; } """ - hlog10_1 = declare_device( - "hlog10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", + uint8(CPointer(_type___nv_bfloat16)), ) - def hlog10_1_caller(arg_0): - return hlog10_1(arg_0) + def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog10, _type___nv_bfloat16) + @lower(__bfloat162uchar_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller, + signature(uint8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog10_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hexp(): +def __bfloat162int_rn(): pass -def _hexp_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp(*a); + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rn(*h); return 0; } """ - hexp_1 = declare_device( - "hexp_1", 
_type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def hexp_1_caller(arg_0): - return hexp_1(arg_0) + def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - @lower(hexp, _type___nv_bfloat16) + @lower(__bfloat162int_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_hexp_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh_approx(): +def __bfloat162int_rz(): pass -def _htanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_approx_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh_approx(*a); + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rz(*h); return 0; } """ - htanh_approx_1 = declare_device( - "htanh_approx_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_approx_1_caller(arg_0): - return 
htanh_approx_1(arg_0) + def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - @lower(htanh_approx, _type___nv_bfloat16) + @lower(__bfloat162int_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htanh_approx_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh_approx(): +def __bfloat162int_rd(): pass -def _h2tanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_approx_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh_approx(*a); + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rd(*h); return 0; } """ - h2tanh_approx_1 = declare_device( - "h2tanh_approx_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def h2tanh_approx_1_caller(arg_0): - return h2tanh_approx_1(arg_0) + def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2tanh_approx, _type___nv_bfloat162) + 
@lower(__bfloat162int_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_approx_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2tanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh(): +def __bfloat162int_ru(): pass -def _htanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh(*a); + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_ru(*h); return 0; } """ - htanh_1 = declare_device( - "htanh_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_1_caller(arg_0): - return htanh_1(arg_0) + def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - @lower(htanh, _type___nv_bfloat16) + @lower(__bfloat162int_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", 
shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htanh_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh(): +def __int2bfloat16_rn(): pass -def _h2tanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh(*a); + _ZL17__int2bfloat16_rni_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rn(*i); return 0; } """ - h2tanh_1 = declare_device( - "h2tanh_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__int2bfloat16_rni_nbst = declare_device( + "_ZL17__int2bfloat16_rni_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def h2tanh_1_caller(arg_0): - return h2tanh_1(arg_0) + def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rni_nbst(arg_0) - @lower(h2tanh, _type___nv_bfloat162) + @lower(__int2bfloat16_rn, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__int2bfloat16_rni_nbst_caller, + signature(_type___nv_bfloat16, 
CPointer(int32)), ptrs, ) -_h2tanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj) -def hexp2(): +def __int2bfloat16_rz(): pass -def _hexp2_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp2(*a); + _ZL17__int2bfloat16_rzi_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rz(*i); return 0; } """ - hexp2_1 = declare_device( - "hexp2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rzi_nbst = declare_device( + "_ZL17__int2bfloat16_rzi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp2_1_caller(arg_0): - return hexp2_1(arg_0) + def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rzi_nbst(arg_0) - @lower(hexp2, _type___nv_bfloat16) + @lower(__int2bfloat16_rz, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp2_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rzi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp2_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj) -def hexp10(): +def __int2bfloat16_rd(): pass -def _hexp10_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp10(*a); + _ZL17__int2bfloat16_rdi_nbst(__nv_bfloat16 
&retval , int* i) { + retval = __int2bfloat16_rd(*i); return 0; } """ - hexp10_1 = declare_device( - "hexp10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rdi_nbst = declare_device( + "_ZL17__int2bfloat16_rdi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp10_1_caller(arg_0): - return hexp10_1(arg_0) + def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rdi_nbst(arg_0) - @lower(hexp10, _type___nv_bfloat16) + @lower(__int2bfloat16_rd, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp10_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rdi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp10_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj) -def hcos(): +def __int2bfloat16_ru(): pass -def _hcos_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hcos_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hcos(*a); + _ZL17__int2bfloat16_rui_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_ru(*i); return 0; } """ - hcos_1 = declare_device( - "hcos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rui_nbst = declare_device( + "_ZL17__int2bfloat16_rui_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hcos_1_caller(arg_0): - return hcos_1(arg_0) + def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rui_nbst(arg_0) - @lower(hcos, 
_type___nv_bfloat16) + @lower(__int2bfloat16_ru, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hcos_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hcos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rui_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hcos_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj) -def hsin(): +def __bfloat162short_rn(): pass -def _hsin_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsin_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsin(*a); + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rn(*h); return 0; } """ - hsin_1 = declare_device( - "hsin_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def hsin_1_caller(arg_0): - return hsin_1(arg_0) + def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - @lower(hsin, _type___nv_bfloat16) + @lower(__bfloat162short_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsin_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_hsin_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sqrt(): +def __bfloat162short_rz(): pass -def _h2sqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sqrt(*a); + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rz(*h); return 0; } """ - h2sqrt_1 = declare_device( - "h2sqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2sqrt_1_caller(arg_0): - return h2sqrt_1(arg_0) + def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2sqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sqrt_1_caller, - 
signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sqrt_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rsqrt(): +def __bfloat162short_rd(): pass -def _h2rsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rsqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rsqrt(*a); + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rd(*h); return 0; } """ - h2rsqrt_1 = declare_device( - "h2rsqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rsqrt_1_caller(arg_0): - return h2rsqrt_1(arg_0) + def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2rsqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rsqrt_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rsqrt_1_lower(shim_stream, shim_obj) 
+_lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rcp(): +def __bfloat162short_ru(): pass -def _h2rcp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rcp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rcp(*a); + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_ru(*h); return 0; } """ - h2rcp_1 = declare_device( - "h2rcp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rcp_1_caller(arg_0): - return h2rcp_1(arg_0) + def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2rcp, _type___nv_bfloat162) + @lower(__bfloat162short_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rcp_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rcp_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2log(): +def __short2bfloat16_rn(): pass -def _h2log_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int - h2log_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log(*a); + _ZL19__short2bfloat16_rns_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rn(*i); return 0; } """ - h2log_1 = declare_device( - "h2log_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rns_nbst = declare_device( + "_ZL19__short2bfloat16_rns_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log_1_caller(arg_0): - return h2log_1(arg_0) + def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rns_nbst(arg_0) - @lower(h2log, _type___nv_bfloat162) + @lower(__short2bfloat16_rn, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rns_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rns_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj) -def h2log2(): +def __short2bfloat16_rz(): pass -def _h2log2_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log2(*a); + _ZL19__short2bfloat16_rzs_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rz(*i); return 0; } """ - h2log2_1 = declare_device( - "h2log2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rzs_nbst = declare_device( + "_ZL19__short2bfloat16_rzs_nbst", 
_type___nv_bfloat16(CPointer(int16)) ) - def h2log2_1_caller(arg_0): - return h2log2_1(arg_0) + def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rzs_nbst(arg_0) - @lower(h2log2, _type___nv_bfloat162) + @lower(__short2bfloat16_rz, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rzs_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rzs_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log2_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj) -def h2log10(): +def __short2bfloat16_rd(): pass -def _h2log10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log10(*a); + _ZL19__short2bfloat16_rds_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rd(*i); return 0; } """ - h2log10_1 = declare_device( - "h2log10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rds_nbst = declare_device( + "_ZL19__short2bfloat16_rds_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log10_1_caller(arg_0): - return h2log10_1(arg_0) + def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rds_nbst(arg_0) - @lower(h2log10, _type___nv_bfloat162) + @lower(__short2bfloat16_rd, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - 
shim_stream.write_with_key("h2log10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rds_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rds_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log10_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj) -def h2exp(): +def __short2bfloat16_ru(): pass -def _h2exp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp(*a); + _ZL19__short2bfloat16_rus_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_ru(*i); return 0; } """ - h2exp_1 = declare_device( - "h2exp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rus_nbst = declare_device( + "_ZL19__short2bfloat16_rus_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2exp_1_caller(arg_0): - return h2exp_1(arg_0) + def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rus_nbst(arg_0) - @lower(h2exp, _type___nv_bfloat162) + @lower(__short2bfloat16_ru, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rus_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp_1_caller, - 
signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rus_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2exp_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj) -def h2exp2(): +def __bfloat162uint_rn(): pass -def _h2exp2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp2(*a); + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rn(*h); return 0; } """ - h2exp2_1 = declare_device( - "h2exp2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp2_1_caller(arg_0): - return h2exp2_1(arg_0) + def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - @lower(h2exp2, _type___nv_bfloat162) + @lower(__bfloat162uint_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, 
shim_obj) -def h2exp10(): +def __bfloat162uint_rz(): pass -def _h2exp10_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp10(*a); + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rz(*h); return 0; } """ - h2exp10_1 = declare_device( - "h2exp10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp10_1_caller(arg_0): - return h2exp10_1(arg_0) + def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2exp10, _type___nv_bfloat162) + @lower(__bfloat162uint_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp10_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2cos(): +def __bfloat162uint_rd(): pass -def _h2cos_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
h2cos_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2cos(*a); + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rd(*h); return 0; } """ - h2cos_1 = declare_device( - "h2cos_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2cos_1_caller(arg_0): - return h2cos_1(arg_0) + def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2cos, _type___nv_bfloat162) + @lower(__bfloat162uint_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2cos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2cos_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2cos_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sin(): +def __bfloat162uint_ru(): pass -def _h2sin_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sin_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sin(*a); + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_ru(*h); return 0; } """ - h2sin_1 = declare_device( - "h2sin_1", 
_type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2sin_1_caller(arg_0): - return h2sin_1(arg_0) + def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2sin, _type___nv_bfloat162) + @lower(__bfloat162uint_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sin_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sin_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def atomicAdd(): +def __uint2bfloat16_rn(): pass -def _atomicAdd_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_1(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rnj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rn(*i); return 0; } """ - atomicAdd_1 = declare_device( - "atomicAdd_1", - _type___nv_bfloat162( - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst = declare_device( + "_ZL18__uint2bfloat16_rnj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - 
def atomicAdd_1_caller(arg_0, arg_1): - return atomicAdd_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__uint2bfloat16_rn, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rnj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_1_caller, - signature( - _type___nv_bfloat162, - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rz(): + pass -def _atomicAdd_2_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_2(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rzj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rz(*i); return 0; } """ - atomicAdd_2 = declare_device( - "atomicAdd_2", - _type___nv_bfloat16( - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst = declare_device( + "_ZL18__uint2bfloat16_rzj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def atomicAdd_2_caller(arg_0, arg_1): - return atomicAdd_2(arg_0, arg_1) + def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - 
@lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__uint2bfloat16_rz, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rzj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_2_caller, - signature( - _type___nv_bfloat16, - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_2_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rd(): + pass -def _operator_add_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+(*lh, *rh); + _ZL18__uint2bfloat16_rdj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rd(*i); return 0; } """ - operator_add_1 = declare_device( - "operator_add_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_rdj_nbst = declare_device( + "_ZL18__uint2bfloat16_rdj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_add_1_caller(arg_0, arg_1): - return operator_add_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_rd, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - 
shim_stream.write_with_key("operator_add_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rdj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_add_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rdj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_add_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_ru(): + pass -def _operator_sub_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-(*lh, *rh); + _ZL18__uint2bfloat16_ruj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_ru(*i); return 0; } """ - operator_sub_1 = declare_device( - "operator_sub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_ruj_nbst = declare_device( + "_ZL18__uint2bfloat16_ruj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_sub_1_caller(arg_0, arg_1): - return operator_sub_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_ru, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_ruj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_sub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_ruj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_sub_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj) -def _operator_mul_1_lower(shim_stream, shim_obj): +def __bfloat162ushort_rn(): + pass + + +def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*(*lh, *rh); + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rn(*h); return 0; } """ - operator_mul_1 = declare_device( - "operator_mul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_mul_1_caller(arg_0, arg_1): - return operator_mul_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_mul_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rz(): + pass -def _operator_truediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/(*lh, *rh); + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rz(*h); return 0; } """ - operator_truediv_1 = declare_device( - "operator_truediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_1_caller(arg_0, arg_1): - return operator_truediv_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_truediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rd(): + pass -def _operator_iadd_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+=(*lh, *rh); + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rd(*h); return 0; } """ - operator_iadd_1 = declare_device( - "operator_iadd_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_1_caller(arg_0, arg_1): - return operator_iadd_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_ru(): + pass -def _operator_isub_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-=(*lh, *rh); + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_ru(*h); return 0; } """ - operator_isub_1 = declare_device( - "operator_isub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_isub_1_caller(arg_0, arg_1): - return operator_isub_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", 
None)) return context.compile_internal( builder, - operator_isub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_imul_1_lower(shim_stream, shim_obj): +def __ushort2bfloat16_rn(): + pass + + +def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_imul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*=(*lh, *rh); + _ZL20__ushort2bfloat16_rnt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rn(*i); return 0; } """ - operator_imul_1 = declare_device( - "operator_imul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rnt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rnt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_imul_1_caller(arg_0, arg_1): - return operator_imul_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rn, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rnt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - 
CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rnt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_imul_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rz(): + pass -def _operator_itruediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/=(*lh, *rh); + _ZL20__ushort2bfloat16_rzt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rz(*i); return 0; } """ - operator_itruediv_1 = declare_device( - "operator_itruediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rzt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rzt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_itruediv_1_caller(arg_0, arg_1): - return operator_itruediv_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rz, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rzt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rzt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) 
-_operator_itruediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rd(): + pass -def _operator_pos_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator+(*h); + _ZL20__ushort2bfloat16_rdt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rd(*i); return 0; } """ - operator_pos_1 = declare_device( - "operator_pos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rdt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rdt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_pos_1_caller(arg_0): - return operator_pos_1(arg_0) + def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rd, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rdt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rdt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_pos_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_ru(): + pass -def _operator_neg_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
operator_neg_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator-(*h); + _ZL20__ushort2bfloat16_rut_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_ru(*i); return 0; } """ - operator_neg_1 = declare_device( - "operator_neg_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rut_nbst = declare_device( + "_ZL20__ushort2bfloat16_rut_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_neg_1_caller(arg_0): - return operator_neg_1(arg_0) + def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat16) + @lower(__ushort2bfloat16_ru, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rut_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_neg_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rut_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_neg_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rn(): + pass -def _operator_eq_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_eq_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator==(*lh, *rh); + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rn(*h); return 0; } """ - operator_eq_1 = declare_device( - "operator_eq_1", - 
bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_eq_1_caller(arg_0, arg_1): - return operator_eq_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_eq_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_eq_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rz(): + pass -def _operator_ne_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ne_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator!=(*lh, *rh); + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rz(*h); return 0; } """ - operator_ne_1 = declare_device( - "operator_ne_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst = 
declare_device( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ne_1_caller(arg_0, arg_1): - return operator_ne_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ne_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ne_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_gt_1_lower(shim_stream, shim_obj): +def make_bfloat162(): + pass + + +def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_gt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>(*lh, *rh); + _ZL14make_bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { + retval = make_bfloat162(*x, *y); return 0; } """ - operator_gt_1 = declare_device( - "operator_gt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL14make_bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + 
CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), ) - def operator_gt_1_caller(arg_0, arg_1): - return operator_gt_1(arg_0, arg_1) + def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_gt_1_caller, + _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller, signature( - bool_, + _type___nv_bfloat162, CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16), ), @@ -3880,858 +4066,11629 @@ def impl(context, builder, sig, args): ) -_operator_gt_1_lower(shim_stream, shim_obj) +_lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) -def _operator_lt_1_lower(shim_stream, shim_obj): +def __bfloat162ull_rd(): + pass + + +def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_lt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<(*lh, *rh); + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rd(*h); return 0; } """ - operator_lt_1 = declare_device( - "operator_lt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def 
operator_lt_1_caller(arg_0, arg_1): - return operator_lt_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_lt_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_lt_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_ru(): + pass -def _operator_ge_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ge_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>=(*lh, *rh); + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_ru(*h); return 0; } """ - operator_ge_1 = declare_device( - "operator_ge_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ge_1_caller(arg_0, arg_1): - return operator_ge_1(arg_0, arg_1) + def 
_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ge_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ge_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rn(): + pass -def _operator_le_1_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_le_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<=(*lh, *rh); + _ZL17__ull2bfloat16_rny_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rn(*i); return 0; } """ - operator_le_1 = declare_device( - "operator_le_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__ull2bfloat16_rny_nbst = declare_device( + "_ZL17__ull2bfloat16_rny_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_le_1_caller(arg_0, arg_1): - return operator_le_1(arg_0, arg_1) + def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rny_nbst(arg_0) - @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ull2bfloat16_rn, 
uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_le_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__ull2bfloat16_rny_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_le_1_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rz(): + pass -def _operator_add_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+(*lh, *rh); + _ZL17__ull2bfloat16_rzy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rz(*i); return 0; } """ - operator_add_2 = declare_device( - "operator_add_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rzy_nbst = declare_device( + "_ZL17__ull2bfloat16_rzy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_add_2_caller(arg_0, arg_1): - return operator_add_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rz, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_add_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", 
shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_add_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rzy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_add_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rd(): + pass -def _operator_sub_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-(*lh, *rh); + _ZL17__ull2bfloat16_rdy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rd(*i); return 0; } """ - operator_sub_2 = declare_device( - "operator_sub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rdy_nbst = declare_device( + "_ZL17__ull2bfloat16_rdy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_sub_2_caller(arg_0, arg_1): - return operator_sub_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rd, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return 
context.compile_internal( builder, - operator_sub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rdy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_sub_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_ru(): + pass -def _operator_mul_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*(*lh, *rh); + _ZL17__ull2bfloat16_ruy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_ru(*i); return 0; } """ - operator_mul_2 = declare_device( - "operator_mul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_ruy_nbst = declare_device( + "_ZL17__ull2bfloat16_ruy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_mul_2_caller(arg_0, arg_1): - return operator_mul_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_ru, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_ruy_nbst_caller, + 
signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_mul_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj) -def _operator_truediv_2_lower(shim_stream, shim_obj): +def __bfloat162ll_rn(): + pass + + +def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator/(*lh, *rh); + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rn(*h); return 0; } """ - operator_truediv_2 = declare_device( - "operator_truediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_2_caller(arg_0, arg_1): - return operator_truediv_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) 
-_operator_truediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rz(): + pass -def _operator_iadd_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+=(*lh, *rh); + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rz(*h); return 0; } """ - operator_iadd_2 = declare_device( - "operator_iadd_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_2_caller(arg_0, arg_1): - return operator_iadd_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_2_lower(shim_stream, shim_obj) 
+_lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rd(): + pass -def _operator_isub_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-=(*lh, *rh); + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rd(*h); return 0; } """ - operator_isub_2 = declare_device( - "operator_isub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_isub_2_caller(arg_0, arg_1): - return operator_isub_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_isub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + 
+ +def __bfloat162ll_ru(): + pass -def _operator_imul_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_imul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*=(*lh, *rh); + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_ru(*h); return 0; } """ - operator_imul_2 = declare_device( - "operator_imul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_imul_2_caller(arg_0, arg_1): - return operator_imul_2(arg_0, arg_1) + def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_imul_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_itruediv_2_lower(shim_stream, shim_obj): +def 
__ll2bfloat16_rn(): + pass + + +def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator/=(*lh, *rh); + _ZL16__ll2bfloat16_rnx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rn(*i); return 0; } """ - operator_itruediv_2 = declare_device( - "operator_itruediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__ll2bfloat16_rnx_nbst = declare_device( + "_ZL16__ll2bfloat16_rnx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_itruediv_2_caller(arg_0, arg_1): - return operator_itruediv_2(arg_0, arg_1) + def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ll2bfloat16_rn, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__ll2bfloat16_rnx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_itruediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rz(): + pass -def _operator_pos_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_2(__nv_bfloat162 &retval , 
__nv_bfloat162* h) { - retval = operator+(*h); + _ZL16__ll2bfloat16_rzx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rz(*i); return 0; } """ - operator_pos_2 = declare_device( - "operator_pos_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rzx_nbst = declare_device( + "_ZL16__ll2bfloat16_rzx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_pos_2_caller(arg_0): - return operator_pos_2(arg_0) + def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat162) + @lower(__ll2bfloat16_rz, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__ll2bfloat16_rzx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_pos_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rd(): + pass -def _operator_neg_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_neg_2(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = operator-(*h); + _ZL16__ll2bfloat16_rdx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rd(*i); return 0; } """ - operator_neg_2 = declare_device( - "operator_neg_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rdx_nbst = declare_device( + "_ZL16__ll2bfloat16_rdx_nbst", 
_type___nv_bfloat16(CPointer(int64)) ) - def operator_neg_2_caller(arg_0): - return operator_neg_2(arg_0) + def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat162) + @lower(__ll2bfloat16_rd, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_neg_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), - ptrs, - ) + return context.compile_internal( + builder, + _ZL16__ll2bfloat16_rdx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), + ptrs, + ) + + +_lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_ru(): + pass + + +def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__ll2bfloat16_rux_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_ru(*i); + return 0; + } + """ + + _ZL16__ll2bfloat16_rux_nbst = declare_device( + "_ZL16__ll2bfloat16_rux_nbst", _type___nv_bfloat16(CPointer(int64)) + ) + + def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rux_nbst(arg_0) + + @lower(__ll2bfloat16_ru, int64) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL16__ll2bfloat16_rux_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), + ptrs, + ) + + +_lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj) + + +def htrunc(): + pass + + +def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6htrunc13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = htrunc(*h); + return 0; + } + """ + + _ZL6htrunc13__nv_bfloat16_nbst = declare_device( + "_ZL6htrunc13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) + + @lower(htrunc, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6htrunc13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6htrunc13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hceil(): + pass + + +def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hceil13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hceil(*h); + return 0; + } + """ + + _ZL5hceil13__nv_bfloat16_nbst = declare_device( + "_ZL5hceil13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hceil13__nv_bfloat16_nbst(arg_0) + + @lower(hceil, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + 
"_ZL5hceil13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hceil13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hfloor(): + pass + + +def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hfloor13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hfloor(*h); + return 0; + } + """ + + _ZL6hfloor13__nv_bfloat16_nbst = declare_device( + "_ZL6hfloor13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) + + @lower(hfloor, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hfloor13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hfloor13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrint(): + pass + + +def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hrint13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hrint(*h); + return 0; + } + """ + + _ZL5hrint13__nv_bfloat16_nbst = declare_device( + "_ZL5hrint13__nv_bfloat16_nbst", + 
_type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hrint13__nv_bfloat16_nbst(arg_0) + + @lower(hrint, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hrint13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hrint13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2trunc(): + pass + + +def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2trunc14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2trunc(*h); + return 0; + } + """ + + _ZL7h2trunc14__nv_bfloat162_nbst = declare_device( + "_ZL7h2trunc14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) + + @lower(h2trunc, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2trunc14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2trunc14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2ceil(): + 
pass + + +def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2ceil14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2ceil(*h); + return 0; + } + """ + + _ZL6h2ceil14__nv_bfloat162_nbst = declare_device( + "_ZL6h2ceil14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) + + @lower(h2ceil, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2ceil14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2ceil14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2floor(): + pass + + +def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2floor14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2floor(*h); + return 0; + } + """ + + _ZL7h2floor14__nv_bfloat162_nbst = declare_device( + "_ZL7h2floor14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) + + @lower(h2floor, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2floor14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, 
arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2floor14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rint(): + pass + + +def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2rint14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2rint(*h); + return 0; + } + """ + + _ZL6h2rint14__nv_bfloat162_nbst = declare_device( + "_ZL6h2rint14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) + + @lower(h2rint, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2rint14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2rint14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat162bfloat162(): + pass + + +def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(__nv_bfloat162 &retval , __nv_bfloat16* a) { + retval = __bfloat162bfloat162(*a); + return 0; + } + """ + + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", + 
_type___nv_bfloat162(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat162bfloat162, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __lowhigh2highlow(): + pass + + +def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __lowhigh2highlow(*a); + return 0; + } + """ + + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst = declare_device( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): + return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) + + @lower(__lowhigh2highlow, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __lows2bfloat162(): + pass + + +def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __lows2bfloat162(*a, *b); + return 0; + } + """ + + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __highs2bfloat162(): + pass + + +def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __highs2bfloat162(*a, *b); + 
return 0; + } + """ + + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __high2bfloat16(): + pass + + +def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__high2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __high2bfloat16(*a); + return 0; + } + """ + + _ZL15__high2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __low2bfloat16(): + pass + + +def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__low2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __low2bfloat16(*a); + return 0; + } + """ + + _ZL14__low2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hisinf(): + pass + + +def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisinf13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* a) { + retval = __hisinf(*a); + return 0; + } + """ + + 
_ZL8__hisinf13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisinf13__nv_bfloat16_nbst", int32(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) + + @lower(__hisinf, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisinf13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisinf13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __halves2bfloat162(): + pass + + +def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __halves2bfloat162(*a, *b); + return 0; + } + """ + + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __low2bfloat162(): + pass + + +def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__low2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __low2bfloat162(*a); + return 0; + } + """ + + _ZL15__low2bfloat16214__nv_bfloat162_nbst = declare_device( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __high2bfloat162(): + pass + + +def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__high2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __high2bfloat162(*a); + return 0; + } + """ + + _ZL16__high2bfloat16214__nv_bfloat162_nbst = 
declare_device( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_short(): + pass + + +def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_short(*h); + return 0; + } + """ + + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_short, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return 
context.compile_internal( + builder, + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_ushort(): + pass + + +def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_ushort(*h); + return 0; + } + """ + + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_ushort, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __short_as_bfloat16(): + pass + + +def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__short_as_bfloat16s_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short_as_bfloat16(*i); + return 0; + } + """ + + _ZL19__short_as_bfloat16s_nbst = declare_device( + "_ZL19__short_as_bfloat16s_nbst", _type___nv_bfloat16(CPointer(int16)) + ) + + def 
_ZL19__short_as_bfloat16s_nbst_caller(arg_0): + return _ZL19__short_as_bfloat16s_nbst(arg_0) + + @lower(__short_as_bfloat16, int16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__short_as_bfloat16s_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL19__short_as_bfloat16s_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), + ptrs, + ) + + +_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj) + + +def __ushort_as_bfloat16(): + pass + + +def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__ushort_as_bfloat16t_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort_as_bfloat16(*i); + return 0; + } + """ + + _ZL20__ushort_as_bfloat16t_nbst = declare_device( + "_ZL20__ushort_as_bfloat16t_nbst", _type___nv_bfloat16(CPointer(uint16)) + ) + + def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): + return _ZL20__ushort_as_bfloat16t_nbst(arg_0) + + @lower(__ushort_as_bfloat16, uint16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__ushort_as_bfloat16t_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__ushort_as_bfloat16t_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), + ptrs, + ) + + +_lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj) + + +def __shfl_sync(): + pass + + +def _lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + 
shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def __shfl_up_sync(): + pass + + +def _lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + 
CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_down_sync(): + pass + + +def _lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_xor_sync(): + pass + + +def _lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* 
mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( + 
arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for 
arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def __ldg(): + pass + + +def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK14__nv_bfloat162_nbst = declare_device( + "_ZL5__ldgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK13__nv_bfloat16_nbst = declare_device( + 
"_ZL5__ldgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcg(): + pass + + +def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, 
CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldca(): + pass + + +def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcs(): + pass + + +def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 
** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcsPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__ldcsPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldlu(): + pass + + +def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldluPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldlu, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldluPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldlu, 
CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcv(): + pass + + +def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcvPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL6__ldcvPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __stwb(): + pass + + +def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwbP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwbP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcg(): + pass + + +def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + 
__stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcgP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcgP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, 
ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcs(): + pass + + +def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + _ZL6__stcsP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcsP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + 
_ZL6__stcsP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcsP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stwt(): + pass + + +def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwtP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, 
sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwtP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __heq2(): + pass + + +def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__heq214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __heq2(*a, *b); + return 0; + } + """ + + _ZL6__heq214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__heq214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__heq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__heq214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2(): + pass + + +def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hne214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2(*a, *b); + return 0; + } + """ + + _ZL6__hne214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hne214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hne214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2(): + pass + + +def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hle214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2(*a, *b); + return 0; + } + """ + + _ZL6__hle214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hle214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hle214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hle214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2(): + pass + + +def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hge214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2(*a, *b); + return 0; + } + """ + + _ZL6__hge214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__hge214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hge214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2(): + pass + + +def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hlt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2(*a, *b); + return 0; + } + """ + + _ZL6__hlt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hlt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hlt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hlt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2(): + pass + + +def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2(*a, *b); + return 0; + } + """ + + _ZL6__hgt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hgt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hequ2(): + pass + + +def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hequ214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2(*a, *b); + return 0; + } + """ + + _ZL7__hequ214__nv_bfloat162S__nbst = 
declare_device( + "_ZL7__hequ214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hequ214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2(): + pass + + +def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2(*a, *b); + return 0; + } + """ + + _ZL7__hneu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hneu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2(): + pass + + +def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hleu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2(*a, *b); + return 0; + } + """ + + _ZL7__hleu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hleu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hleu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2(): + pass + + +def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgeu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2(*a, *b); + return 0; + } + """ + + 
_ZL7__hgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgeu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgeu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2(): + pass + + +def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hltu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2(*a, *b); + return 0; + } + """ + + _ZL7__hltu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hltu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hltu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2(): + pass + + +def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgtu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2(*a, *b); + return 0; + } + """ + + _ZL7__hgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgtu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgtu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq2_mask(): + pass + + +def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__heq2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = 
__heq2_mask(*a, *b); + return 0; + } + """ + + _ZL11__heq2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2_mask(): + pass + + +def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hne2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hne2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2_mask(): + pass + + +def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hle2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hle2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2_mask(): + pass + + +def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + 
_ZL11__hge2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hge2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2_mask(): + pass + + +def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hlt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hlt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2_mask(): + pass + + +def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hgt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hgt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def 
__hequ2_mask(): + pass + + +def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hequ2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hequ2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2_mask(): + pass + + +def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hneu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hneu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return 
_ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2_mask(): + pass + + +def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hleu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hleu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, 
+ CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2_mask(): + pass + + +def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2_mask(): + pass + + +def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hltu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hltu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", + 
uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2_mask(): + pass + + +def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hisnan2(): + pass + + +def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hisnan214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hisnan2(*a); + return 0; + } + """ + + _ZL9__hisnan214__nv_bfloat162_nbst = declare_device( + "_ZL9__hisnan214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): + return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) + + @lower(__hisnan2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hisnan214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hisnan214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2(): + pass + + +def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hadd214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2(*a, *b); + return 0; + } + """ + + _ZL7__hadd214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hadd214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), 
CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hadd214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hadd214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2(): + pass + + +def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hsub214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2(*a, *b); + return 0; + } + """ + + _ZL7__hsub214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hsub214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hsub214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL7__hsub214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2(): + pass + + +def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmul214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2(*a, *b); + return 0; + } + """ + + _ZL7__hmul214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmul214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmul214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmul214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hadd2_rn(): + pass + + +def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hadd2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", 
+ _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_rn(): + pass + + +def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hsub2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_rn(): + pass + + +def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hmul2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __h2div(): + pass + + +def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__h2div14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + 
retval = __h2div(*a, *b); + return 0; + } + """ + + _ZL7__h2div14__nv_bfloat162S__nbst = declare_device( + "_ZL7__h2div14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__h2div14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__h2div14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __habs2(): + pass + + +def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__habs214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __habs2(*a); + return 0; + } + """ + + _ZL7__habs214__nv_bfloat162_nbst = declare_device( + "_ZL7__habs214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__habs214__nv_bfloat162_nbst(arg_0) + + @lower(__habs2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__habs214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__habs214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2_sat(): + pass + + +def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hadd2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hadd2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_sat(): + pass + + +def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hsub2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_sat(*a, *b); + return 0; + } + """ + + 
_ZL11__hsub2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_sat(): + pass + + +def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmul2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hmul2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2(): + pass + + +def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hfma214__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2(*a, *b, *c); + return 0; + } + """ + + _ZL7__hfma214__nv_bfloat162S_S__nbst = declare_device( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hfma214__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hfma2_sat(): 
+ pass + + +def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_sat(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst = declare_device( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_sat, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hneg2(): + pass + + +def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneg214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hneg2(*a); + return 0; + } + """ + + _ZL7__hneg214__nv_bfloat162_nbst = declare_device( + "_ZL7__hneg214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) + + @lower(__hneg2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneg214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneg214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __habs(): + pass + + +def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__habs13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __habs(*a); + return 0; + } + """ + + _ZL6__habs13__nv_bfloat16_nbst = declare_device( + "_ZL6__habs13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__habs13__nv_bfloat16_nbst(arg_0) + + @lower(__habs, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__habs13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__habs13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hadd(): + pass + + +def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hadd13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd(*a, *b); + return 0; + } + """ + + _ZL6__hadd13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hadd13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hadd13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hadd13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub(): + pass + + +def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hsub13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub(*a, *b); + return 0; + } + """ + + _ZL6__hsub13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hsub13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL6__hsub13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hsub13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul(): + pass + + +def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmul13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul(*a, *b); + return 0; + } + """ + + _ZL6__hmul13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmul13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmul13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmul13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_rn(): + pass + + +def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL9__hadd_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_rn(*a, *b); + return 0; + } + """ + + _ZL9__hadd_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_rn(): + pass + + +def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hsub_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_rn(*a, *b); + return 0; + } + """ + + _ZL9__hsub_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul_rn(): + pass + + +def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hmul_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_rn(*a, *b); + return 0; + } + """ + + _ZL9__hmul_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hdiv(): + pass + + +def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hdiv13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hdiv(*a, *b); + return 0; + } + """ + + _ZL6__hdiv13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hdiv13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hdiv13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hdiv13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_sat(): + pass + + +def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_sat(*a, *b); + return 0; + } + """ + + _ZL10__hadd_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_sat(): + pass + + +def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_sat(*a, *b); + return 0; + } + """ + + _ZL10__hsub_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def 
__hmul_sat(): + pass + + +def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_sat(*a, *b); + return 0; + } + """ + + _ZL10__hmul_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma(): + pass + + +def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hfma13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma(*a, *b, *c); + return 0; + } + """ + + _ZL6__hfma13__nv_bfloat16S_S__nbst = declare_device( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return 
_ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hfma13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hfma_sat(): + pass + + +def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_sat(*a, *b, *c); + return 0; + } + """ + + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst = declare_device( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_sat, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hneg(): + pass + + +def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneg13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __hneg(*a); + return 0; + } + """ + + _ZL6__hneg13__nv_bfloat16_nbst = declare_device( + "_ZL6__hneg13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) + + @lower(__hneg, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneg13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneg13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hbeq2(): + pass + + +def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbeq214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbeq2(*a, *b); + return 0; + } + """ + + _ZL7__hbeq214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbeq214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbeq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbeq214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbne2(): + pass + + +def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbne214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbne2(*a, *b); + return 0; + } + """ + + _ZL7__hbne214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbne214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbne214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hble2(): + pass + + +def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hble214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hble2(*a, *b); + return 0; + } + """ + + _ZL7__hble214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hble214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hble214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hble214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbge2(): + pass + + +def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbge214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbge2(*a, *b); + return 0; + } + """ + + _ZL7__hbge214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbge214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, 
arg_1) + + @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbge214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hblt2(): + pass + + +def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hblt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hblt2(*a, *b); + return 0; + } + """ + + _ZL7__hblt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hblt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hblt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hblt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, 
shim_obj) + + +def __hbgt2(): + pass + + +def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbgt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgt2(*a, *b); + return 0; + } + """ + + _ZL7__hbgt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbgt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbgt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbequ2(): + pass + + +def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbequ214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbequ2(*a, *b); + return 0; + } + """ + + _ZL8__hbequ214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbequ214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, 
sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbequ214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbneu2(): + pass + + +def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbneu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbneu2(*a, *b); + return 0; + } + """ + + _ZL8__hbneu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbneu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbneu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbleu2(): + pass + + +def 
_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbleu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbleu2(*a, *b); + return 0; + } + """ + + _ZL8__hbleu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbleu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbleu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgeu2(): + pass + + +def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgeu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgeu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgeu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbltu2(): + pass + + +def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbltu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbltu2(*a, *b); + return 0; + } + """ + + _ZL8__hbltu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbltu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbltu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgtu2(): + pass + + +def 
_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgtu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgtu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgtu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq(): + pass + + +def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__heq13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __heq(*a, *b); + return 0; + } + """ + + _ZL5__heq13__nv_bfloat16S__nbst = declare_device( + "_ZL5__heq13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL5__heq13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__heq13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hne(): + pass + + +def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hne13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hne(*a, *b); + return 0; + } + """ + + _ZL5__hne13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hne13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hne13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hne13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hle(): + pass + + +def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hle13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, 
__nv_bfloat16* b) { + retval = __hle(*a, *b); + return 0; + } + """ + + _ZL5__hle13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hle13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hle13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hle13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hge(): + pass + + +def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hge13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hge(*a, *b); + return 0; + } + """ + + _ZL5__hge13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hge13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hge13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hge13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hlt(): + pass + + +def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hlt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hlt(*a, *b); + return 0; + } + """ + + _ZL5__hlt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hlt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hlt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hlt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgt(): + pass + + +def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hgt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgt(*a, *b); + return 0; + } + """ + + _ZL5__hgt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hgt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + 
) + + def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hgt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hgt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hequ(): + pass + + +def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hequ13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hequ(*a, *b); + return 0; + } + """ + + _ZL6__hequ13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hequ13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hequ13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hequ13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + 
ptrs, + ) + + +_lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hneu(): + pass + + +def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hneu(*a, *b); + return 0; + } + """ + + _ZL6__hneu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hneu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hleu(): + pass + + +def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hleu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hleu(*a, *b); + return 0; + } + """ + + _ZL6__hleu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hleu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, 
builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hleu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hleu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgeu(): + pass + + +def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgeu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgeu(*a, *b); + return 0; + } + """ + + _ZL6__hgeu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgeu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgeu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgeu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hltu(): + pass + + +def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = 
""" + extern "C" __device__ int + _ZL6__hltu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hltu(*a, *b); + return 0; + } + """ + + _ZL6__hltu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hltu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hltu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hltu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgtu(): + pass + + +def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgtu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgtu(*a, *b); + return 0; + } + """ + + _ZL6__hgtu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgtu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgtu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgtu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hisnan(): + pass + + +def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisnan13__nv_bfloat16_nbst(bool &retval , __nv_bfloat16* a) { + retval = __hisnan(*a); + return 0; + } + """ + + _ZL8__hisnan13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisnan13__nv_bfloat16_nbst", bool_(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) + + @lower(__hisnan, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisnan13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisnan13__nv_bfloat16_nbst_caller, + signature(bool_, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hmax(): + pass + + +def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmax13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax(*a, *b); + return 0; + } + """ + + _ZL6__hmax13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmax13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + 
CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmax13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmax13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin(): + pass + + +def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmin13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin(*a, *b); + return 0; + } + """ + + _ZL6__hmin13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmin13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmin13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__hmin13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmax_nan(): + pass + + +def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmax_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmax_nan13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin_nan(): + pass + + +def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmin_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmin_nan13__nv_bfloat16S__nbst = declare_device( + 
"_ZL10__hmin_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmin_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma_relu(): + pass + + +def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_relu(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst = declare_device( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_relu, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", 
shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hmax2(): + pass + + +def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmax214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2(*a, *b); + return 0; + } + """ + + _ZL7__hmax214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmax214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmax214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmax214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2(): + pass + + +def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + 
extern "C" __device__ int + _ZL7__hmin214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2(*a, *b); + return 0; + } + """ + + _ZL7__hmin214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmin214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmin214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmin214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmax2_nan(): + pass + + +def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmax2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmax2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2_nan(): + pass + + +def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmin2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmin2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + 
+_lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2_relu(): + pass + + +def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_relu(*a, *b, *c); + return 0; + } + """ + + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst = declare_device( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_relu, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hcmadd(): + pass + + +def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hcmadd14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hcmadd(*a, *b, *c); + return 0; + } + """ + + 
_ZL8__hcmadd14__nv_bfloat162S_S__nbst = declare_device( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hcmadd, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def hsqrt(): + pass + + +def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsqrt(*a); + return 0; + } + """ + + _ZL5hsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL5hsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrsqrt(): + pass + + +def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hrsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrsqrt(*a); + return 0; + } + """ + + _ZL6hrsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL6hrsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hrsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hrsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hrsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrcp(): + pass + + +def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hrcp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrcp(*a); + return 0; + } + """ + + _ZL4hrcp13__nv_bfloat16_nbst = declare_device( + "_ZL4hrcp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) + + @lower(hrcp, 
_type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hrcp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog(): + pass + + +def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hlog13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog(*a); + return 0; + } + """ + + _ZL4hlog13__nv_bfloat16_nbst = declare_device( + "_ZL4hlog13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hlog13__nv_bfloat16_nbst(arg_0) + + @lower(hlog, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hlog13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog2(): + pass + + +def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hlog213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog2(*a); + return 0; + } + """ 
+ + _ZL5hlog213__nv_bfloat16_nbst = declare_device( + "_ZL5hlog213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hlog213__nv_bfloat16_nbst(arg_0) + + @lower(hlog2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hlog213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hlog213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog10(): + pass + + +def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hlog1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog10(*a); + return 0; + } + """ + + _ZL6hlog1013__nv_bfloat16_nbst = declare_device( + "_ZL6hlog1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) + + @lower(hlog10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hlog1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hlog1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + 
+_lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp(): + pass + + +def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hexp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp(*a); + return 0; + } + """ + + _ZL4hexp13__nv_bfloat16_nbst = declare_device( + "_ZL4hexp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hexp13__nv_bfloat16_nbst(arg_0) + + @lower(hexp, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hexp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def htanh_approx(): + pass + + +def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12htanh_approx13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh_approx(*a); + return 0; + } + """ + + _ZL12htanh_approx13__nv_bfloat16_nbst = declare_device( + "_ZL12htanh_approx13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): + return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) + + @lower(htanh_approx, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12htanh_approx13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs 
= [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12htanh_approx13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh_approx(): + pass + + +def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL13h2tanh_approx14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh_approx(*a); + return 0; + } + """ + + _ZL13h2tanh_approx14__nv_bfloat162_nbst = declare_device( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): + return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh_approx, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def htanh(): + pass + + +def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5htanh13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh(*a); + return 0; + } + """ + + _ZL5htanh13__nv_bfloat16_nbst = 
declare_device( + "_ZL5htanh13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5htanh13__nv_bfloat16_nbst(arg_0) + + @lower(htanh, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5htanh13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5htanh13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh(): + pass + + +def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2tanh14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh(*a); + return 0; + } + """ + + _ZL6h2tanh14__nv_bfloat162_nbst = declare_device( + "_ZL6h2tanh14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2tanh14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2tanh14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + 
+_lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def hexp2(): + pass + + +def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hexp213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp2(*a); + return 0; + } + """ + + _ZL5hexp213__nv_bfloat16_nbst = declare_device( + "_ZL5hexp213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hexp213__nv_bfloat16_nbst(arg_0) + + @lower(hexp2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hexp213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hexp213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp10(): + pass + + +def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hexp1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp10(*a); + return 0; + } + """ + + _ZL6hexp1013__nv_bfloat16_nbst = declare_device( + "_ZL6hexp1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) + + @lower(hexp10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hexp1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) 
for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hexp1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hcos(): + pass + + +def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hcos13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hcos(*a); + return 0; + } + """ + + _ZL4hcos13__nv_bfloat16_nbst = declare_device( + "_ZL4hcos13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hcos13__nv_bfloat16_nbst(arg_0) + + @lower(hcos, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hcos13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hsin(): + pass + + +def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hsin13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsin(*a); + return 0; + } + """ + + _ZL4hsin13__nv_bfloat16_nbst = declare_device( + "_ZL4hsin13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hsin13__nv_bfloat16_nbst(arg_0) + + 
@lower(hsin, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hsin13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2sqrt(): + pass + + +def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2sqrt14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sqrt(*a); + return 0; + } + """ + + _ZL6h2sqrt14__nv_bfloat162_nbst = declare_device( + "_ZL6h2sqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2sqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2sqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2sqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rsqrt(): + pass + + +def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2rsqrt14__nv_bfloat162_nbst(__nv_bfloat162 
&retval , __nv_bfloat162* a) { + retval = h2rsqrt(*a); + return 0; + } + """ + + _ZL7h2rsqrt14__nv_bfloat162_nbst = declare_device( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2rsqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2rsqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rcp(): + pass + + +def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2rcp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2rcp(*a); + return 0; + } + """ + + _ZL5h2rcp14__nv_bfloat162_nbst = declare_device( + "_ZL5h2rcp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) + + @lower(h2rcp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2rcp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2rcp14__nv_bfloat162_nbst_caller, 
+ signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log(): + pass + + +def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2log14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log(*a); + return 0; + } + """ + + _ZL5h2log14__nv_bfloat162_nbst = declare_device( + "_ZL5h2log14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2log14__nv_bfloat162_nbst(arg_0) + + @lower(h2log, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2log14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2log14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log2(): + pass + + +def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2log214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log2(*a); + return 0; + } + """ + + _ZL6h2log214__nv_bfloat162_nbst = declare_device( + "_ZL6h2log214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2log214__nv_bfloat162_nbst(arg_0) + + @lower(h2log2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( 
+ "_ZL6h2log214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2log214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log10(): + pass + + +def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2log1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log10(*a); + return 0; + } + """ + + _ZL7h2log1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2log1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) + + @lower(h2log10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2log1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2log1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp(): + pass + + +def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2exp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp(*a); + return 0; + } + """ + + _ZL5h2exp14__nv_bfloat162_nbst = declare_device( + 
"_ZL5h2exp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) + + @lower(h2exp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2exp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2exp14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp2(): + pass + + +def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2exp214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp2(*a); + return 0; + } + """ + + _ZL6h2exp214__nv_bfloat162_nbst = declare_device( + "_ZL6h2exp214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) + + @lower(h2exp2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2exp214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2exp214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, 
shim_obj) + + +def h2exp10(): + pass + + +def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2exp1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp10(*a); + return 0; + } + """ + + _ZL7h2exp1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2exp1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) + + @lower(h2exp10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2exp1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2exp1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2cos(): + pass + + +def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2cos14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2cos(*a); + return 0; + } + """ + + _ZL5h2cos14__nv_bfloat162_nbst = declare_device( + "_ZL5h2cos14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) + + @lower(h2cos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2cos14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2cos14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2sin(): + pass + + +def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2sin14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sin(*a); + return 0; + } + """ + + _ZL5h2sin14__nv_bfloat162_nbst = declare_device( + "_ZL5h2sin14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) + + @lower(h2sin, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2sin14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2sin14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def atomicAdd(): + pass + + +def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP14__nv_bfloat162S__nbst = declare_device( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + 
CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP13__nv_bfloat16S__nbst = declare_device( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK13__nv_bfloat16S1__nbst = declare_device( + "_ZplRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmiRK13__nv_bfloat16S1__nbst", + 
_type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmlRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK13__nv_bfloat16S1__nbst_caller, + 
signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK13__nv_bfloat16S1__nbst = declare_device( + "_ZdvRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR13__nv_bfloat16RKS__nbst = declare_device( + "_ZpLR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return 
_ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR13__nv_bfloat16RKS__nbst = declare_device( + "_ZmIR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + 
+_lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator*=(*lh, *rh); + return 0; + } + """ + + _ZmLR13__nv_bfloat16RKS__nbst = declare_device( + "_ZmLR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmLR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmLR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdVR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator/=(*lh, *rh); + return 0; + } + """ + + _ZdVR13__nv_bfloat16RKS__nbst = declare_device( + "_ZdVR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, 
sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdVR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdVR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpsRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = operator+(*h); + return 0; + } + """ + + _ZpsRK13__nv_bfloat16_nbst = declare_device( + "_ZpsRK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): + return _ZpsRK13__nv_bfloat16_nbst(arg_0) + + @lower(operator.pos, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpsRK13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZngRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = operator-(*h); + return 0; + } + """ + + _ZngRK13__nv_bfloat16_nbst = declare_device( + 
"_ZngRK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): + return _ZngRK13__nv_bfloat16_nbst(arg_0) + + @lower(operator.neg, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZngRK13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZeqRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator==(*lh, *rh); + return 0; + } + """ + + _ZeqRK13__nv_bfloat16S1__nbst = declare_device( + "_ZeqRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZeqRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZeqRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + 
+_lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK13__nv_bfloat16S1__nbst = declare_device( + "_ZneRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgtRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZgtRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK13__nv_bfloat16S1__nbst = declare_device( + "_ZltRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>=(*lh, *rh); + return 
0; + } + """ + + _ZgeRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgeRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK13__nv_bfloat16S1__nbst = declare_device( + "_ZleRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZleRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK14__nv_bfloat162S1__nbst = declare_device( + "_ZplRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmiRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def 
_ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmlRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK14__nv_bfloat162S1__nbst = declare_device( + "_ZdvRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZpLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + 
@lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmIR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, 
shim_obj) + + +def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*=(*lh, *rh); + return 0; + } + """ + + _ZmLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdVR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/=(*lh, *rh); + return 0; + } + """ + + _ZdVR14__nv_bfloat162RKS__nbst = declare_device( + "_ZdVR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdVR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdVR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpsRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator+(*h); + return 0; + } + """ + + _ZpsRK14__nv_bfloat162_nbst = declare_device( + "_ZpsRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZpsRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.pos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpsRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZngRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator-(*h); + return 0; + } + """ + + _ZngRK14__nv_bfloat162_nbst = 
declare_device( + "_ZngRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZngRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.neg, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZngRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZeqRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator==(*lh, *rh); + return 0; + } + """ + + _ZeqRK14__nv_bfloat162S1__nbst = declare_device( + "_ZeqRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZeqRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZeqRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), 
+ ), + ptrs, + ) + + +_lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK14__nv_bfloat162S1__nbst = declare_device( + "_ZneRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgtRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgtRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK14__nv_bfloat162S1__nbst = declare_device( + "_ZltRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK14__nv_bfloat162S1__nbst(bool &retval , 
__nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>=(*lh, *rh); + return 0; + } + """ + + _ZgeRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgeRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK14__nv_bfloat162S1__nbst = declare_device( + "_ZleRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZleRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def __half(): + pass + + +def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZN6__halfC1E13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* f) { + __half(*f); + return 0; + } + """ + + _ZN6__halfC1E13__nv_bfloat16_nbst = declare_device( + "_ZN6__halfC1E13__nv_bfloat16_nbst", void(CPointer(_type___nv_bfloat16)) + ) + + def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): + return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) + + @lower(__half, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZN6__halfC1E13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZN6__halfC1E13__nv_bfloat16_nbst_caller, + signature(void, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +@register +class _typing___double2bfloat16(ConcreteTemplate): + key = globals()["__double2bfloat16"] + cases = [signature(_type___nv_bfloat16, float64)] + + +register_global(__double2bfloat16, types.Function(_typing___double2bfloat16)) + + +@register +class _typing___float2bfloat16(ConcreteTemplate): + key = globals()["__float2bfloat16"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global(__float2bfloat16, types.Function(_typing___float2bfloat16)) + + +@register +class 
_typing___float2bfloat16_rn(ConcreteTemplate): + key = globals()["__float2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rn, types.Function(_typing___float2bfloat16_rn) +) + + +@register +class _typing___float2bfloat16_rz(ConcreteTemplate): + key = globals()["__float2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rz, types.Function(_typing___float2bfloat16_rz) +) + + +@register +class _typing___float2bfloat16_rd(ConcreteTemplate): + key = globals()["__float2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rd, types.Function(_typing___float2bfloat16_rd) +) + + +@register +class _typing___float2bfloat16_ru(ConcreteTemplate): + key = globals()["__float2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_ru, types.Function(_typing___float2bfloat16_ru) +) + + +@register +class _typing___bfloat162float(ConcreteTemplate): + key = globals()["__bfloat162float"] + cases = [signature(float32, _type___nv_bfloat16)] + + +register_global(__bfloat162float, types.Function(_typing___bfloat162float)) + + +@register +class _typing___float2bfloat162_rn(ConcreteTemplate): + key = globals()["__float2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32)] + + +register_global( + __float2bfloat162_rn, types.Function(_typing___float2bfloat162_rn) +) + + +@register +class _typing___floats2bfloat162_rn(ConcreteTemplate): + key = globals()["__floats2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32, float32)] + + +register_global( + __floats2bfloat162_rn, types.Function(_typing___floats2bfloat162_rn) +) + + +@register +class _typing___low2float(ConcreteTemplate): + key = globals()["__low2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__low2float, types.Function(_typing___low2float)) + + +@register 
+class _typing___high2float(ConcreteTemplate): + key = globals()["__high2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__high2float, types.Function(_typing___high2float)) + + +@register +class _typing___float22bfloat162_rn(ConcreteTemplate): + key = globals()["__float22bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32x2)] + + +register_global( + __float22bfloat162_rn, types.Function(_typing___float22bfloat162_rn) +) + + +@register +class _typing___bfloat1622float2(ConcreteTemplate): + key = globals()["__bfloat1622float2"] + cases = [signature(float32x2, _type___nv_bfloat162)] + + +register_global(__bfloat1622float2, types.Function(_typing___bfloat1622float2)) + + +@register +class _typing___bfloat162char_rz(ConcreteTemplate): + key = globals()["__bfloat162char_rz"] + cases = [signature(int8, _type___nv_bfloat16)] + + +register_global(__bfloat162char_rz, types.Function(_typing___bfloat162char_rz)) + + +@register +class _typing___bfloat162uchar_rz(ConcreteTemplate): + key = globals()["__bfloat162uchar_rz"] + cases = [signature(uint8, _type___nv_bfloat16)] + + +register_global( + __bfloat162uchar_rz, types.Function(_typing___bfloat162uchar_rz) +) + + +@register +class _typing___bfloat162int_rn(ConcreteTemplate): + key = globals()["__bfloat162int_rn"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rn, types.Function(_typing___bfloat162int_rn)) + + +@register +class _typing___bfloat162int_rz(ConcreteTemplate): + key = globals()["__bfloat162int_rz"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rz, types.Function(_typing___bfloat162int_rz)) + + +@register +class _typing___bfloat162int_rd(ConcreteTemplate): + key = globals()["__bfloat162int_rd"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rd, types.Function(_typing___bfloat162int_rd)) + + +@register +class _typing___bfloat162int_ru(ConcreteTemplate): + 
key = globals()["__bfloat162int_ru"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_ru, types.Function(_typing___bfloat162int_ru)) + + +@register +class _typing___int2bfloat16_rn(ConcreteTemplate): + key = globals()["__int2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rn, types.Function(_typing___int2bfloat16_rn)) + + +@register +class _typing___int2bfloat16_rz(ConcreteTemplate): + key = globals()["__int2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rz, types.Function(_typing___int2bfloat16_rz)) + + +@register +class _typing___int2bfloat16_rd(ConcreteTemplate): + key = globals()["__int2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rd, types.Function(_typing___int2bfloat16_rd)) + + +@register +class _typing___int2bfloat16_ru(ConcreteTemplate): + key = globals()["__int2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_ru, types.Function(_typing___int2bfloat16_ru)) + + +@register +class _typing___bfloat162short_rn(ConcreteTemplate): + key = globals()["__bfloat162short_rn"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rn, types.Function(_typing___bfloat162short_rn) +) + + +@register +class _typing___bfloat162short_rz(ConcreteTemplate): + key = globals()["__bfloat162short_rz"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rz, types.Function(_typing___bfloat162short_rz) +) + + +@register +class _typing___bfloat162short_rd(ConcreteTemplate): + key = globals()["__bfloat162short_rd"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rd, types.Function(_typing___bfloat162short_rd) +) + + +@register +class _typing___bfloat162short_ru(ConcreteTemplate): + key = globals()["__bfloat162short_ru"] + cases 
= [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_ru, types.Function(_typing___bfloat162short_ru) +) + + +@register +class _typing___short2bfloat16_rn(ConcreteTemplate): + key = globals()["__short2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rn, types.Function(_typing___short2bfloat16_rn) +) + + +@register +class _typing___short2bfloat16_rz(ConcreteTemplate): + key = globals()["__short2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rz, types.Function(_typing___short2bfloat16_rz) +) + + +@register +class _typing___short2bfloat16_rd(ConcreteTemplate): + key = globals()["__short2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rd, types.Function(_typing___short2bfloat16_rd) +) + + +@register +class _typing___short2bfloat16_ru(ConcreteTemplate): + key = globals()["__short2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_ru, types.Function(_typing___short2bfloat16_ru) +) + + +@register +class _typing___bfloat162uint_rn(ConcreteTemplate): + key = globals()["__bfloat162uint_rn"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rn, types.Function(_typing___bfloat162uint_rn)) + + +@register +class _typing___bfloat162uint_rz(ConcreteTemplate): + key = globals()["__bfloat162uint_rz"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rz, types.Function(_typing___bfloat162uint_rz)) + + +@register +class _typing___bfloat162uint_rd(ConcreteTemplate): + key = globals()["__bfloat162uint_rd"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rd, types.Function(_typing___bfloat162uint_rd)) + + +@register +class _typing___bfloat162uint_ru(ConcreteTemplate): + key = globals()["__bfloat162uint_ru"] + cases = 
[signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_ru, types.Function(_typing___bfloat162uint_ru)) + + +@register +class _typing___uint2bfloat16_rn(ConcreteTemplate): + key = globals()["__uint2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rn, types.Function(_typing___uint2bfloat16_rn)) + + +@register +class _typing___uint2bfloat16_rz(ConcreteTemplate): + key = globals()["__uint2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rz, types.Function(_typing___uint2bfloat16_rz)) + + +@register +class _typing___uint2bfloat16_rd(ConcreteTemplate): + key = globals()["__uint2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rd, types.Function(_typing___uint2bfloat16_rd)) + + +@register +class _typing___uint2bfloat16_ru(ConcreteTemplate): + key = globals()["__uint2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_ru, types.Function(_typing___uint2bfloat16_ru)) + + +@register +class _typing___bfloat162ushort_rn(ConcreteTemplate): + key = globals()["__bfloat162ushort_rn"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rn, types.Function(_typing___bfloat162ushort_rn) +) + + +@register +class _typing___bfloat162ushort_rz(ConcreteTemplate): + key = globals()["__bfloat162ushort_rz"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rz, types.Function(_typing___bfloat162ushort_rz) +) + + +@register +class _typing___bfloat162ushort_rd(ConcreteTemplate): + key = globals()["__bfloat162ushort_rd"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rd, types.Function(_typing___bfloat162ushort_rd) +) + + +@register +class _typing___bfloat162ushort_ru(ConcreteTemplate): + key = globals()["__bfloat162ushort_ru"] + cases = 
[signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_ru, types.Function(_typing___bfloat162ushort_ru) +) + + +@register +class _typing___ushort2bfloat16_rn(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rn, types.Function(_typing___ushort2bfloat16_rn) +) + + +@register +class _typing___ushort2bfloat16_rz(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rz, types.Function(_typing___ushort2bfloat16_rz) +) + + +@register +class _typing___ushort2bfloat16_rd(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rd, types.Function(_typing___ushort2bfloat16_rd) +) + + +@register +class _typing___ushort2bfloat16_ru(ConcreteTemplate): + key = globals()["__ushort2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_ru, types.Function(_typing___ushort2bfloat16_ru) +) + + +@register +class _typing___bfloat162ull_rn(ConcreteTemplate): + key = globals()["__bfloat162ull_rn"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rn, types.Function(_typing___bfloat162ull_rn)) + + +@register +class _typing___bfloat162ull_rz(ConcreteTemplate): + key = globals()["__bfloat162ull_rz"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rz, types.Function(_typing___bfloat162ull_rz)) + + +@register +class _typing_make_bfloat162(ConcreteTemplate): + key = globals()["make_bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) + + +@register +class _typing___bfloat162ull_rd(ConcreteTemplate): + key = 
globals()["__bfloat162ull_rd"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rd, types.Function(_typing___bfloat162ull_rd)) + + +@register +class _typing___bfloat162ull_ru(ConcreteTemplate): + key = globals()["__bfloat162ull_ru"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_ru, types.Function(_typing___bfloat162ull_ru)) + + +@register +class _typing___ull2bfloat16_rn(ConcreteTemplate): + key = globals()["__ull2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rn, types.Function(_typing___ull2bfloat16_rn)) + + +@register +class _typing___ull2bfloat16_rz(ConcreteTemplate): + key = globals()["__ull2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rz, types.Function(_typing___ull2bfloat16_rz)) + + +@register +class _typing___ull2bfloat16_rd(ConcreteTemplate): + key = globals()["__ull2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rd, types.Function(_typing___ull2bfloat16_rd)) + + +@register +class _typing___ull2bfloat16_ru(ConcreteTemplate): + key = globals()["__ull2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_ru, types.Function(_typing___ull2bfloat16_ru)) + + +@register +class _typing___bfloat162ll_rn(ConcreteTemplate): + key = globals()["__bfloat162ll_rn"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rn, types.Function(_typing___bfloat162ll_rn)) + + +@register +class _typing___bfloat162ll_rz(ConcreteTemplate): + key = globals()["__bfloat162ll_rz"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rz, types.Function(_typing___bfloat162ll_rz)) + + +@register +class _typing___bfloat162ll_rd(ConcreteTemplate): + key = globals()["__bfloat162ll_rd"] + cases = [signature(int64, _type___nv_bfloat16)] + + 
+register_global(__bfloat162ll_rd, types.Function(_typing___bfloat162ll_rd)) + + +@register +class _typing___bfloat162ll_ru(ConcreteTemplate): + key = globals()["__bfloat162ll_ru"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_ru, types.Function(_typing___bfloat162ll_ru)) + + +@register +class _typing___ll2bfloat16_rn(ConcreteTemplate): + key = globals()["__ll2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rn, types.Function(_typing___ll2bfloat16_rn)) + + +@register +class _typing___ll2bfloat16_rz(ConcreteTemplate): + key = globals()["__ll2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rz, types.Function(_typing___ll2bfloat16_rz)) + + +@register +class _typing___ll2bfloat16_rd(ConcreteTemplate): + key = globals()["__ll2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rd, types.Function(_typing___ll2bfloat16_rd)) + + +@register +class _typing___ll2bfloat16_ru(ConcreteTemplate): + key = globals()["__ll2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_ru, types.Function(_typing___ll2bfloat16_ru)) + + +@register +class _typing_htrunc(ConcreteTemplate): + key = globals()["htrunc"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(htrunc, types.Function(_typing_htrunc)) + + +@register +class _typing_hceil(ConcreteTemplate): + key = globals()["hceil"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hceil, types.Function(_typing_hceil)) + + +@register +class _typing_hfloor(ConcreteTemplate): + key = globals()["hfloor"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hfloor, types.Function(_typing_hfloor)) + + +@register +class _typing_hrint(ConcreteTemplate): + key = globals()["hrint"] + cases = [signature(_type___nv_bfloat16, 
_type___nv_bfloat16)] + + +register_global(hrint, types.Function(_typing_hrint)) + + +@register +class _typing_h2trunc(ConcreteTemplate): + key = globals()["h2trunc"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2trunc, types.Function(_typing_h2trunc)) + + +@register +class _typing_h2ceil(ConcreteTemplate): + key = globals()["h2ceil"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2ceil, types.Function(_typing_h2ceil)) + + +@register +class _typing_h2floor(ConcreteTemplate): + key = globals()["h2floor"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2floor, types.Function(_typing_h2floor)) + + +@register +class _typing_h2rint(ConcreteTemplate): + key = globals()["h2rint"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2rint, types.Function(_typing_h2rint)) + + +@register +class _typing___bfloat162bfloat162(ConcreteTemplate): + key = globals()["__bfloat162bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat16)] + + +register_global( + __bfloat162bfloat162, types.Function(_typing___bfloat162bfloat162) +) + + +@register +class _typing___lowhigh2highlow(ConcreteTemplate): + key = globals()["__lowhigh2highlow"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__lowhigh2highlow, types.Function(_typing___lowhigh2highlow)) + + +@register +class _typing___lows2bfloat162(ConcreteTemplate): + key = globals()["__lows2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__lows2bfloat162, types.Function(_typing___lows2bfloat162)) + + +@register +class _typing___highs2bfloat162(ConcreteTemplate): + key = globals()["__highs2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__highs2bfloat162, 
types.Function(_typing___highs2bfloat162)) + + +@register +class _typing___high2bfloat16(ConcreteTemplate): + key = globals()["__high2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__high2bfloat16, types.Function(_typing___high2bfloat16)) + + +@register +class _typing___low2bfloat16(ConcreteTemplate): + key = globals()["__low2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__low2bfloat16, types.Function(_typing___low2bfloat16)) + + +@register +class _typing___hisinf(ConcreteTemplate): + key = globals()["__hisinf"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__hisinf, types.Function(_typing___hisinf)) + + +@register +class _typing___halves2bfloat162(ConcreteTemplate): + key = globals()["__halves2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(__halves2bfloat162, types.Function(_typing___halves2bfloat162)) + + +@register +class _typing___low2bfloat162(ConcreteTemplate): + key = globals()["__low2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__low2bfloat162, types.Function(_typing___low2bfloat162)) + + +@register +class _typing___high2bfloat162(ConcreteTemplate): + key = globals()["__high2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__high2bfloat162, types.Function(_typing___high2bfloat162)) + + +@register +class _typing___bfloat16_as_short(ConcreteTemplate): + key = globals()["__bfloat16_as_short"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_short, types.Function(_typing___bfloat16_as_short) +) + + +@register +class _typing___bfloat16_as_ushort(ConcreteTemplate): + key = globals()["__bfloat16_as_ushort"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_ushort, 
types.Function(_typing___bfloat16_as_ushort) +) + + +@register +class _typing___short_as_bfloat16(ConcreteTemplate): + key = globals()["__short_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short_as_bfloat16, types.Function(_typing___short_as_bfloat16) +) + + +@register +class _typing___ushort_as_bfloat16(ConcreteTemplate): + key = globals()["__ushort_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort_as_bfloat16, types.Function(_typing___ushort_as_bfloat16) +) + + +@register +class _typing___shfl_sync(ConcreteTemplate): + key = globals()["__shfl_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_sync, types.Function(_typing___shfl_sync)) + + +@register +class _typing___shfl_up_sync(ConcreteTemplate): + key = globals()["__shfl_up_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_up_sync, types.Function(_typing___shfl_up_sync)) + + +@register +class _typing___shfl_down_sync(ConcreteTemplate): + key = globals()["__shfl_down_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_down_sync, types.Function(_typing___shfl_down_sync)) + + +@register +class _typing___shfl_xor_sync(ConcreteTemplate): + key = globals()["__shfl_xor_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_xor_sync, 
types.Function(_typing___shfl_xor_sync)) + + +@register +class _typing___ldg(ConcreteTemplate): + key = globals()["__ldg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldg, types.Function(_typing___ldg)) + + +@register +class _typing___ldcg(ConcreteTemplate): + key = globals()["__ldcg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcg, types.Function(_typing___ldcg)) + + +@register +class _typing___ldca(ConcreteTemplate): + key = globals()["__ldca"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldca, types.Function(_typing___ldca)) + + +@register +class _typing___ldcs(ConcreteTemplate): + key = globals()["__ldcs"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcs, types.Function(_typing___ldcs)) + + +@register +class _typing___ldlu(ConcreteTemplate): + key = globals()["__ldlu"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldlu, types.Function(_typing___ldlu)) + + +@register +class _typing___ldcv(ConcreteTemplate): + key = globals()["__ldcv"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcv, types.Function(_typing___ldcv)) + + +@register +class _typing___stwb(ConcreteTemplate): + key = globals()["__stwb"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, 
CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwb, types.Function(_typing___stwb)) + + +@register +class _typing___stcg(ConcreteTemplate): + key = globals()["__stcg"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcg, types.Function(_typing___stcg)) + + +@register +class _typing___stcs(ConcreteTemplate): + key = globals()["__stcs"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcs, types.Function(_typing___stcs)) + + +@register +class _typing___stwt(ConcreteTemplate): + key = globals()["__stwt"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwt, types.Function(_typing___stwt)) + + +@register +class _typing___heq2(ConcreteTemplate): + key = globals()["__heq2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__heq2, types.Function(_typing___heq2)) + + +@register +class _typing___hne2(ConcreteTemplate): + key = globals()["__hne2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hne2, types.Function(_typing___hne2)) + + +@register +class _typing___hle2(ConcreteTemplate): + key = globals()["__hle2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hle2, types.Function(_typing___hle2)) + + +@register +class _typing___hge2(ConcreteTemplate): + key = globals()["__hge2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hge2, 
types.Function(_typing___hge2)) + + +@register +class _typing___hlt2(ConcreteTemplate): + key = globals()["__hlt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hlt2, types.Function(_typing___hlt2)) + + +@register +class _typing___hgt2(ConcreteTemplate): + key = globals()["__hgt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgt2, types.Function(_typing___hgt2)) + + +@register +class _typing___hequ2(ConcreteTemplate): + key = globals()["__hequ2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hequ2, types.Function(_typing___hequ2)) + + +@register +class _typing___hneu2(ConcreteTemplate): + key = globals()["__hneu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hneu2, types.Function(_typing___hneu2)) + + +@register +class _typing___hleu2(ConcreteTemplate): + key = globals()["__hleu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hleu2, types.Function(_typing___hleu2)) + + +@register +class _typing___hgeu2(ConcreteTemplate): + key = globals()["__hgeu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgeu2, types.Function(_typing___hgeu2)) + + +@register +class _typing___hltu2(ConcreteTemplate): + key = globals()["__hltu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hltu2, types.Function(_typing___hltu2)) + + +@register +class _typing___hgtu2(ConcreteTemplate): + key = globals()["__hgtu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgtu2, 
types.Function(_typing___hgtu2)) + + +@register +class _typing___heq2_mask(ConcreteTemplate): + key = globals()["__heq2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__heq2_mask, types.Function(_typing___heq2_mask)) + + +@register +class _typing___hne2_mask(ConcreteTemplate): + key = globals()["__hne2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hne2_mask, types.Function(_typing___hne2_mask)) + + +@register +class _typing___hle2_mask(ConcreteTemplate): + key = globals()["__hle2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hle2_mask, types.Function(_typing___hle2_mask)) + + +@register +class _typing___hge2_mask(ConcreteTemplate): + key = globals()["__hge2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hge2_mask, types.Function(_typing___hge2_mask)) + + +@register +class _typing___hlt2_mask(ConcreteTemplate): + key = globals()["__hlt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hlt2_mask, types.Function(_typing___hlt2_mask)) + + +@register +class _typing___hgt2_mask(ConcreteTemplate): + key = globals()["__hgt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgt2_mask, types.Function(_typing___hgt2_mask)) + + +@register +class _typing___hequ2_mask(ConcreteTemplate): + key = globals()["__hequ2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hequ2_mask, types.Function(_typing___hequ2_mask)) + + +@register +class _typing___hneu2_mask(ConcreteTemplate): + key = globals()["__hneu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneu2_mask, types.Function(_typing___hneu2_mask)) + + +@register +class 
_typing___hleu2_mask(ConcreteTemplate): + key = globals()["__hleu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hleu2_mask, types.Function(_typing___hleu2_mask)) + + +@register +class _typing___hgeu2_mask(ConcreteTemplate): + key = globals()["__hgeu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgeu2_mask, types.Function(_typing___hgeu2_mask)) + + +@register +class _typing___hltu2_mask(ConcreteTemplate): + key = globals()["__hltu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hltu2_mask, types.Function(_typing___hltu2_mask)) + + +@register +class _typing___hgtu2_mask(ConcreteTemplate): + key = globals()["__hgtu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgtu2_mask, types.Function(_typing___hgtu2_mask)) + + +@register +class _typing___hisnan2(ConcreteTemplate): + key = globals()["__hisnan2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hisnan2, types.Function(_typing___hisnan2)) + + +@register +class _typing___hadd2(ConcreteTemplate): + key = globals()["__hadd2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2, types.Function(_typing___hadd2)) + + +@register +class _typing___hsub2(ConcreteTemplate): + key = globals()["__hsub2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2, types.Function(_typing___hsub2)) + + +@register +class _typing___hmul2(ConcreteTemplate): + key = globals()["__hmul2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2, types.Function(_typing___hmul2)) + + +@register +class _typing___hadd2_rn(ConcreteTemplate): + key = 
globals()["__hadd2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_rn, types.Function(_typing___hadd2_rn)) + + +@register +class _typing___hsub2_rn(ConcreteTemplate): + key = globals()["__hsub2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_rn, types.Function(_typing___hsub2_rn)) + + +@register +class _typing___hmul2_rn(ConcreteTemplate): + key = globals()["__hmul2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_rn, types.Function(_typing___hmul2_rn)) + + +@register +class _typing___h2div(ConcreteTemplate): + key = globals()["__h2div"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__h2div, types.Function(_typing___h2div)) + + +@register +class _typing___habs2(ConcreteTemplate): + key = globals()["__habs2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__habs2, types.Function(_typing___habs2)) + + +@register +class _typing___hadd2_sat(ConcreteTemplate): + key = globals()["__hadd2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_sat, types.Function(_typing___hadd2_sat)) + + +@register +class _typing___hsub2_sat(ConcreteTemplate): + key = globals()["__hsub2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_sat, types.Function(_typing___hsub2_sat)) + + +@register +class _typing___hmul2_sat(ConcreteTemplate): + key = globals()["__hmul2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_sat, types.Function(_typing___hmul2_sat)) + + +@register +class 
_typing___hfma2(ConcreteTemplate): + key = globals()["__hfma2"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2, types.Function(_typing___hfma2)) + + +@register +class _typing___hfma2_sat(ConcreteTemplate): + key = globals()["__hfma2_sat"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2_sat, types.Function(_typing___hfma2_sat)) + + +@register +class _typing___hneg2(ConcreteTemplate): + key = globals()["__hneg2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneg2, types.Function(_typing___hneg2)) + + +@register +class _typing___habs(ConcreteTemplate): + key = globals()["__habs"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__habs, types.Function(_typing___habs)) + + +@register +class _typing___hadd(ConcreteTemplate): + key = globals()["__hadd"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd, types.Function(_typing___hadd)) + + +@register +class _typing___hsub(ConcreteTemplate): + key = globals()["__hsub"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub, types.Function(_typing___hsub)) + + +@register +class _typing___hmul(ConcreteTemplate): + key = globals()["__hmul"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul, types.Function(_typing___hmul)) + + +@register +class _typing___hadd_rn(ConcreteTemplate): + key = globals()["__hadd_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_rn, types.Function(_typing___hadd_rn)) + + +@register +class _typing___hsub_rn(ConcreteTemplate): + key = 
globals()["__hsub_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_rn, types.Function(_typing___hsub_rn)) + + +@register +class _typing___hmul_rn(ConcreteTemplate): + key = globals()["__hmul_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_rn, types.Function(_typing___hmul_rn)) + + +@register +class _typing___hdiv(ConcreteTemplate): + key = globals()["__hdiv"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hdiv, types.Function(_typing___hdiv)) + + +@register +class _typing___hadd_sat(ConcreteTemplate): + key = globals()["__hadd_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_sat, types.Function(_typing___hadd_sat)) + + +@register +class _typing___hsub_sat(ConcreteTemplate): + key = globals()["__hsub_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_sat, types.Function(_typing___hsub_sat)) + + +@register +class _typing___hmul_sat(ConcreteTemplate): + key = globals()["__hmul_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_sat, types.Function(_typing___hmul_sat)) + + +@register +class _typing___hfma(ConcreteTemplate): + key = globals()["__hfma"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma, types.Function(_typing___hfma)) + + +@register +class _typing___hfma_sat(ConcreteTemplate): + key = globals()["__hfma_sat"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma_sat, types.Function(_typing___hfma_sat)) + + +@register +class 
_typing___hneg(ConcreteTemplate): + key = globals()["__hneg"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__hneg, types.Function(_typing___hneg)) + + +@register +class _typing___hbeq2(ConcreteTemplate): + key = globals()["__hbeq2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbeq2, types.Function(_typing___hbeq2)) + + +@register +class _typing___hbne2(ConcreteTemplate): + key = globals()["__hbne2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbne2, types.Function(_typing___hbne2)) + + +@register +class _typing___hble2(ConcreteTemplate): + key = globals()["__hble2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hble2, types.Function(_typing___hble2)) + + +@register +class _typing___hbge2(ConcreteTemplate): + key = globals()["__hbge2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbge2, types.Function(_typing___hbge2)) + + +@register +class _typing___hblt2(ConcreteTemplate): + key = globals()["__hblt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hblt2, types.Function(_typing___hblt2)) + + +@register +class _typing___hbgt2(ConcreteTemplate): + key = globals()["__hbgt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgt2, types.Function(_typing___hbgt2)) + + +@register +class _typing___hbequ2(ConcreteTemplate): + key = globals()["__hbequ2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbequ2, types.Function(_typing___hbequ2)) + + +@register +class _typing___hbneu2(ConcreteTemplate): + key = globals()["__hbneu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbneu2, types.Function(_typing___hbneu2)) + + +@register +class 
_typing___hbleu2(ConcreteTemplate): + key = globals()["__hbleu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbleu2, types.Function(_typing___hbleu2)) + + +@register +class _typing___hbgeu2(ConcreteTemplate): + key = globals()["__hbgeu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgeu2, types.Function(_typing___hbgeu2)) + + +@register +class _typing___hbltu2(ConcreteTemplate): + key = globals()["__hbltu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbltu2, types.Function(_typing___hbltu2)) + + +@register +class _typing___hbgtu2(ConcreteTemplate): + key = globals()["__hbgtu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgtu2, types.Function(_typing___hbgtu2)) + + +@register +class _typing___heq(ConcreteTemplate): + key = globals()["__heq"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__heq, types.Function(_typing___heq)) + + +@register +class _typing___hne(ConcreteTemplate): + key = globals()["__hne"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_neg_2_lower(shim_stream, shim_obj) +register_global(__hne, types.Function(_typing___hne)) -def _operator_eq_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_eq_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator==(*lh, *rh); - return 0; - } - """ +@register +class _typing___hle(ConcreteTemplate): + key = globals()["__hle"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_eq_2 = declare_device( - "operator_eq_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_eq_2_caller(arg_0, arg_1): - return operator_eq_2(arg_0, arg_1) +register_global(__hle, types.Function(_typing___hle)) - 
@lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_eq_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hge(ConcreteTemplate): + key = globals()["__hge"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_eq_2_lower(shim_stream, shim_obj) +register_global(__hge, types.Function(_typing___hge)) -def _operator_ne_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ne_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator!=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hlt(ConcreteTemplate): + key = globals()["__hlt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_ne_2 = declare_device( - "operator_ne_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_ne_2_caller(arg_0, arg_1): - return operator_ne_2(arg_0, arg_1) +register_global(__hlt, types.Function(_typing___hlt)) - @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_ne_2_caller, - signature( - bool_, - 
CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgt(ConcreteTemplate): + key = globals()["__hgt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_ne_2_lower(shim_stream, shim_obj) +register_global(__hgt, types.Function(_typing___hgt)) -def _operator_gt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_gt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>(*lh, *rh); - return 0; - } - """ +@register +class _typing___hequ(ConcreteTemplate): + key = globals()["__hequ"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_gt_2 = declare_device( - "operator_gt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_gt_2_caller(arg_0, arg_1): - return operator_gt_2(arg_0, arg_1) +register_global(__hequ, types.Function(_typing___hequ)) - @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_gt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hneu(ConcreteTemplate): + key = globals()["__hneu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_gt_2_lower(shim_stream, shim_obj) +register_global(__hneu, types.Function(_typing___hneu)) -def _operator_lt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_lt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = 
operator<(*lh, *rh); - return 0; - } - """ +@register +class _typing___hleu(ConcreteTemplate): + key = globals()["__hleu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_lt_2 = declare_device( - "operator_lt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_lt_2_caller(arg_0, arg_1): - return operator_lt_2(arg_0, arg_1) +register_global(__hleu, types.Function(_typing___hleu)) - @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_lt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgeu(ConcreteTemplate): + key = globals()["__hgeu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_lt_2_lower(shim_stream, shim_obj) +register_global(__hgeu, types.Function(_typing___hgeu)) -def _operator_ge_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ge_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hltu(ConcreteTemplate): + key = globals()["__hltu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_ge_2 = declare_device( - "operator_ge_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_ge_2_caller(arg_0, arg_1): - return operator_ge_2(arg_0, arg_1) +register_global(__hltu, types.Function(_typing___hltu)) - @lower(operator.ge, _type___nv_bfloat162, 
_type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_ge_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgtu(ConcreteTemplate): + key = globals()["__hgtu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_ge_2_lower(shim_stream, shim_obj) +register_global(__hgtu, types.Function(_typing___hgtu)) -def _operator_le_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_le_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator<=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hisnan(ConcreteTemplate): + key = globals()["__hisnan"] + cases = [signature(bool_, _type___nv_bfloat16)] - operator_le_2 = declare_device( - "operator_le_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_le_2_caller(arg_0, arg_1): - return operator_le_2(arg_0, arg_1) +register_global(__hisnan, types.Function(_typing___hisnan)) - @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_le_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), 
- ), - ptrs, - ) +@register +class _typing___hmax(ConcreteTemplate): + key = globals()["__hmax"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -_operator_le_2_lower(shim_stream, shim_obj) +register_global(__hmax, types.Function(_typing___hmax)) @register -class _typing_make_bfloat162(ConcreteTemplate): - key = globals()["make_bfloat162"] +class _typing___hmin(ConcreteTemplate): + key = globals()["__hmin"] cases = [ - signature( - _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 - ) + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) ] -register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) +register_global(__hmin, types.Function(_typing___hmin)) @register -class _typing_htrunc(ConcreteTemplate): - key = globals()["htrunc"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmax_nan(ConcreteTemplate): + key = globals()["__hmax_nan"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -register_global(htrunc, types.Function(_typing_htrunc)) +register_global(__hmax_nan, types.Function(_typing___hmax_nan)) @register -class _typing_hceil(ConcreteTemplate): - key = globals()["hceil"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmin_nan(ConcreteTemplate): + key = globals()["__hmin_nan"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -register_global(hceil, types.Function(_typing_hceil)) +register_global(__hmin_nan, types.Function(_typing___hmin_nan)) @register -class _typing_hfloor(ConcreteTemplate): - key = globals()["hfloor"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hfma_relu(ConcreteTemplate): + key = globals()["__hfma_relu"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] -register_global(hfloor, 
types.Function(_typing_hfloor)) +register_global(__hfma_relu, types.Function(_typing___hfma_relu)) @register -class _typing_hrint(ConcreteTemplate): - key = globals()["hrint"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmax2(ConcreteTemplate): + key = globals()["__hmax2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(hrint, types.Function(_typing_hrint)) +register_global(__hmax2, types.Function(_typing___hmax2)) @register -class _typing_h2trunc(ConcreteTemplate): - key = globals()["h2trunc"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmin2(ConcreteTemplate): + key = globals()["__hmin2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2trunc, types.Function(_typing_h2trunc)) +register_global(__hmin2, types.Function(_typing___hmin2)) @register -class _typing_h2ceil(ConcreteTemplate): - key = globals()["h2ceil"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmax2_nan(ConcreteTemplate): + key = globals()["__hmax2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2ceil, types.Function(_typing_h2ceil)) +register_global(__hmax2_nan, types.Function(_typing___hmax2_nan)) @register -class _typing_h2floor(ConcreteTemplate): - key = globals()["h2floor"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmin2_nan(ConcreteTemplate): + key = globals()["__hmin2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2floor, types.Function(_typing_h2floor)) +register_global(__hmin2_nan, types.Function(_typing___hmin2_nan)) @register -class _typing_h2rint(ConcreteTemplate): - key = globals()["h2rint"] - cases = [signature(_type___nv_bfloat162, 
_type___nv_bfloat162)] +class _typing___hfma2_relu(ConcreteTemplate): + key = globals()["__hfma2_relu"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] -register_global(h2rint, types.Function(_typing_h2rint)) +register_global(__hfma2_relu, types.Function(_typing___hfma2_relu)) + + +@register +class _typing___hcmadd(ConcreteTemplate): + key = globals()["__hcmadd"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hcmadd, types.Function(_typing___hcmadd)) @register @@ -4988,6 +15945,15 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) +@register +class _typing___half(ConcreteTemplate): + key = globals()["__half"] + cases = [signature(void, _type___nv_bfloat16)] + + +register_global(__half, types.Function(_typing___half)) + + @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -5149,7 +16115,297 @@ class _typing_operator_le(ConcreteTemplate): # Aliases: -__nv_bfloat16_raw = unnamed1401637 -__nv_bfloat162_raw = unnamed1401746 +__nv_bfloat16_raw = unnamed1405307 +__nv_bfloat162_raw = unnamed1405416 nv_bfloat16 = __nv_bfloat16 nv_bfloat162 = __nv_bfloat162 + + +# Symbols: + + +_NBTYPE_SYMBOLS = [ + "_type_unnamed1405307", + "_type_unnamed1405416", + "_type___nv_bfloat16", + "_type___nv_bfloat162", +] + + +_RECORD_SYMBOLS = [ + "unnamed1405307", + "unnamed1405416", + "__nv_bfloat16", + "__nv_bfloat162", +] + + +_FUNCTION_SYMBOLS = [ + "__double2bfloat16", + "__float2bfloat16", + "__float2bfloat16_rn", + "__float2bfloat16_rz", + "__float2bfloat16_rd", + "__float2bfloat16_ru", + "__bfloat162float", + "__float2bfloat162_rn", + "__floats2bfloat162_rn", + "__low2float", + "__high2float", + "__float22bfloat162_rn", + "__bfloat1622float2", + "__bfloat162char_rz", + "__bfloat162uchar_rz", + 
"__bfloat162int_rn", + "__bfloat162int_rz", + "__bfloat162int_rd", + "__bfloat162int_ru", + "__int2bfloat16_rn", + "__int2bfloat16_rz", + "__int2bfloat16_rd", + "__int2bfloat16_ru", + "__bfloat162short_rn", + "__bfloat162short_rz", + "__bfloat162short_rd", + "__bfloat162short_ru", + "__short2bfloat16_rn", + "__short2bfloat16_rz", + "__short2bfloat16_rd", + "__short2bfloat16_ru", + "__bfloat162uint_rn", + "__bfloat162uint_rz", + "__bfloat162uint_rd", + "__bfloat162uint_ru", + "__uint2bfloat16_rn", + "__uint2bfloat16_rz", + "__uint2bfloat16_rd", + "__uint2bfloat16_ru", + "__bfloat162ushort_rn", + "__bfloat162ushort_rz", + "__bfloat162ushort_rd", + "__bfloat162ushort_ru", + "__ushort2bfloat16_rn", + "__ushort2bfloat16_rz", + "__ushort2bfloat16_rd", + "__ushort2bfloat16_ru", + "__bfloat162ull_rn", + "__bfloat162ull_rz", + "make_bfloat162", + "__bfloat162ull_rd", + "__bfloat162ull_ru", + "__ull2bfloat16_rn", + "__ull2bfloat16_rz", + "__ull2bfloat16_rd", + "__ull2bfloat16_ru", + "__bfloat162ll_rn", + "__bfloat162ll_rz", + "__bfloat162ll_rd", + "__bfloat162ll_ru", + "__ll2bfloat16_rn", + "__ll2bfloat16_rz", + "__ll2bfloat16_rd", + "__ll2bfloat16_ru", + "htrunc", + "hceil", + "hfloor", + "hrint", + "h2trunc", + "h2ceil", + "h2floor", + "h2rint", + "__bfloat162bfloat162", + "__lowhigh2highlow", + "__lows2bfloat162", + "__highs2bfloat162", + "__high2bfloat16", + "__low2bfloat16", + "__hisinf", + "__halves2bfloat162", + "__low2bfloat162", + "__high2bfloat162", + "__bfloat16_as_short", + "__bfloat16_as_ushort", + "__short_as_bfloat16", + "__ushort_as_bfloat16", + "__shfl_sync", + "__shfl_sync", + "__shfl_up_sync", + "__shfl_up_sync", + "__shfl_down_sync", + "__shfl_down_sync", + "__shfl_xor_sync", + "__shfl_xor_sync", + "__ldg", + "__ldg", + "__ldcg", + "__ldcg", + "__ldca", + "__ldca", + "__ldcs", + "__ldcs", + "__ldlu", + "__ldlu", + "__ldcv", + "__ldcv", + "__stwb", + "__stwb", + "__stcg", + "__stcg", + "__stcs", + "__stcs", + "__stwt", + "__stwt", + "__heq2", + "__hne2", + 
"__hle2", + "__hge2", + "__hlt2", + "__hgt2", + "__hequ2", + "__hneu2", + "__hleu2", + "__hgeu2", + "__hltu2", + "__hgtu2", + "__heq2_mask", + "__hne2_mask", + "__hle2_mask", + "__hge2_mask", + "__hlt2_mask", + "__hgt2_mask", + "__hequ2_mask", + "__hneu2_mask", + "__hleu2_mask", + "__hgeu2_mask", + "__hltu2_mask", + "__hgtu2_mask", + "__hisnan2", + "__hadd2", + "__hsub2", + "__hmul2", + "__hadd2_rn", + "__hsub2_rn", + "__hmul2_rn", + "__h2div", + "__habs2", + "__hadd2_sat", + "__hsub2_sat", + "__hmul2_sat", + "__hfma2", + "__hfma2_sat", + "__hneg2", + "__habs", + "__hadd", + "__hsub", + "__hmul", + "__hadd_rn", + "__hsub_rn", + "__hmul_rn", + "__hdiv", + "__hadd_sat", + "__hsub_sat", + "__hmul_sat", + "__hfma", + "__hfma_sat", + "__hneg", + "__hbeq2", + "__hbne2", + "__hble2", + "__hbge2", + "__hblt2", + "__hbgt2", + "__hbequ2", + "__hbneu2", + "__hbleu2", + "__hbgeu2", + "__hbltu2", + "__hbgtu2", + "__heq", + "__hne", + "__hle", + "__hge", + "__hlt", + "__hgt", + "__hequ", + "__hneu", + "__hleu", + "__hgeu", + "__hltu", + "__hgtu", + "__hisnan", + "__hmax", + "__hmin", + "__hmax_nan", + "__hmin_nan", + "__hfma_relu", + "__hmax2", + "__hmin2", + "__hmax2_nan", + "__hmin2_nan", + "__hfma2_relu", + "__hcmadd", + "hsqrt", + "hrsqrt", + "hrcp", + "hlog", + "hlog2", + "hlog10", + "hexp", + "htanh_approx", + "h2tanh_approx", + "htanh", + "h2tanh", + "hexp2", + "hexp10", + "hcos", + "hsin", + "h2sqrt", + "h2rsqrt", + "h2rcp", + "h2log", + "h2log2", + "h2log10", + "h2exp", + "h2exp2", + "h2exp10", + "h2cos", + "h2sin", + "atomicAdd", + "atomicAdd", + "operator+", + "operator-", + "operator*", + "operator/", + "operator+=", + "operator-=", + "operator*=", + "operator/=", + "operator+", + "operator-", + "operator==", + "operator!=", + "operator>", + "operator<", + "operator>=", + "operator<=", + "operator+", + "operator-", + "operator*", + "operator/", + "operator+=", + "operator-=", + "operator*=", + "operator/=", + "operator+", + "operator-", + "operator==", + 
"operator!=", + "operator>", + "operator<", + "operator>=", + "operator<=", + "__half", +] + + +__all__ = _NBTYPE_SYMBOLS + _RECORD_SYMBOLS + _FUNCTION_SYMBOLS From ae6de8cfa4e4600e86ec7fe55199e0d86bb3c27d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:31:54 -0700 Subject: [PATCH 03/56] remove re-import of bfloat16 type --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 4fd6c50e4..45eae7d41 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -29,7 +29,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( From 8498a995d250212096bedd9933f3c4b38dbd45ca Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 5 Aug 2025 12:34:09 -0700 Subject: [PATCH 04/56] implement custom bfloat16 type object; insert type registry into cuda target; mock bfloat16 llvmIR type --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 81 ++++++-------------- numba_cuda/numba/cuda/bf16.py | 5 +- numba_cuda/numba/cuda/models.py | 35 ++++++++- numba_cuda/numba/cuda/target.py | 12 ++- numba_cuda/numba/cuda/types.py | 15 ++++ 5 files changed, 86 insertions(+), 62 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 8a263b962..162eec8a8 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,10 +25,11 @@ ) from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast -from 
numba.core.typing.templates import Registry as TypingRegistry +from numba.cuda.typing.templates import Registry as TypingRegistry from numba.cuda.typing import signature from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate from numba.cuda import CUSource, declare_device +from numba.cuda.types import bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -52,6 +53,7 @@ ) float32x2 = vector_types["float32x2"] +__half = float16 # Setups: @@ -192,28 +194,7 @@ class _ctor_template_unnamed1405416(ConcreteTemplate): register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -417,8 +398,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(float32)), - value, + signature(_type___nv_bfloat16, float32), + [value], ) @@ -470,8 +451,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(float64)), - value, + signature(_type___nv_bfloat16, float64), + [value], ) @@ -523,8 +504,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( 
context, builder, - signature(_type___nv_bfloat16, CPointer(int16)), - value, + signature(_type___nv_bfloat16, int16), + [value], ) @@ -576,8 +557,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint16)), - value, + signature(_type___nv_bfloat16, uint16), + [value], ) @@ -629,8 +610,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int32)), - value, + signature(_type___nv_bfloat16, int32), + [value], ) @@ -682,8 +663,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint32)), - value, + signature(_type___nv_bfloat16, uint32), + [value], ) @@ -735,8 +716,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int64)), - value, + signature(_type___nv_bfloat16, int64), + [value], ) @@ -788,8 +769,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint64)), - value, + signature(_type___nv_bfloat16, uint64), + [value], ) @@ -841,8 +822,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int64)), - value, + signature(_type___nv_bfloat16, int64), + [value], ) @@ -894,8 +875,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint64)), - value, + signature(_type___nv_bfloat16, uint64), + [value], ) @@ -13635,10 +13616,6 @@ def impl(context, builder, sig, args): _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) -def __half(): - pass - - def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int @@ -15944,15 +15921,6 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) -@register -class _typing___half(ConcreteTemplate): - key = globals()["__half"] - cases = [signature(void, _type___nv_bfloat16)] - - -register_global(__half, types.Function(_typing___half)) - - @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -16403,7 +16371,6 @@ class _typing_operator_le(ConcreteTemplate): "operator<", "operator>=", "operator<=", - "__half", ] diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 1ef6a370a..7ce28e459 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -1,6 +1,5 @@ from numba.cuda._internal.cuda_bf16 import ( - _type_class___nv_bfloat16, - nv_bfloat16 as bfloat16, + __nv_bfloat16 as bfloat16, htrunc, hceil, hfloor, @@ -25,7 +24,7 @@ def _make_unary(a, func): - if isinstance(a, _type_class___nv_bfloat16): + if isinstance(a, bfloat16): return lambda a: func(a) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index f9735d7fc..768a1c3bf 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -3,9 +3,10 @@ from llvmlite import ir from numba.core.datamodel.registry import DataModelManager, register +from numba.core.datamodel import PrimitiveModel from numba.core.extending import models from numba.core import types -from numba.cuda.types import Dim3, GridGroup, CUDADispatcher +from numba.cuda.types import Dim3, GridGroup, CUDADispatcher, Bfloat16 cuda_data_manager = DataModelManager() @@ -42,3 +43,35 @@ def __init__(self, dmm, fe_type): register_model(CUDADispatcher)(models.OpaqueModel) + + +def _as_bfloat(value): + # Step 1: Convert to float + f = ir.types._as_float(value) + # Step 2: Truncate (or round, we choose truncate) last 16 bits + bf = f >> 16 + return bf + + +class BfloatType(ir.types._BaseFloatType): + """Brain-float type""" + + 
null = "0.0" + intrinsic_name = "bfloat" + + def __str__(self): + return "bfloat" + + def format_constant(self, value): + return ir.types._format_double(_as_bfloat(value)) + + +BfloatType._create_instance() + + +@register_model(Bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + # be_type = BfloatType() + be_type = ir.IntType(16) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 26b717264..66f2e2359 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -31,7 +31,14 @@ class CUDATypingContext(typing.BaseContext): def load_additional_registries(self): - from . import cudadecl, cudamath, fp16, libdevicedecl, vector_types + from . import ( + cudadecl, + cudamath, + fp16, + bf16, + libdevicedecl, + vector_types, + ) from numba.core.typing import enumdecl, cffi_utils self.install_registry(cudadecl.registry) @@ -42,6 +49,7 @@ def load_additional_registries(self): self.install_registry(enumdecl.registry) self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) + self.install_registry(bf16.typing_registry) def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -154,6 +162,7 @@ def load_additional_registries(self): libdeviceimpl, mathimpl, vector_types, + bf16, ) # fix for #8940 @@ -167,6 +176,7 @@ def load_additional_registries(self): self.install_registry(mathimpl.registry) self.install_registry(vector_types.impl_registry) self.install_registry(fp16.target_registry) + self.install_registry(bf16.target_registry) def codegen(self): return self._internal_codegen diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 92b8f3ecb..844ce393a 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -38,3 +38,18 @@ class CUDADispatcher(types.Dispatcher): # is still probably a 
good idea to have a separate type for CUDA # dispatchers, and this type might get other differentiation from the CPU # dispatcher type in future. + + +class Bfloat16(types.Number): + """ + A bfloat16 type. + """ + + def __init__(self): + super().__init__(name="__nv_bfloat16") + + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +bfloat16 = Bfloat16() From f79f0bfa659df0e88b528a4d93ed5ff5932bcf3b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:55:32 -0700 Subject: [PATCH 05/56] update bfloat16 bindings --- configs/cuda_bf16.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 29aa1d2dd..348e48ee7 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -1,7 +1,7 @@ Name: Numba Bfloat16 Version: 0.0.2 GPU Arch: - - sm_80 # The first architecture to support bfloat16 + - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16 Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h File List: - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h @@ -21,7 +21,4 @@ Data Models: __nv_bfloat162: StructModel nv_bfloat162: StructModel Shim Include Override: "\"cuda_bf16.h\"" -Additional Import: - - os -Require Pynvjitlink: False -Use Separate Registry: true +Use Separate Registry: True From 1b3598f2011e9776ea9463e080831142f39969d1 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:56:07 -0700 Subject: [PATCH 06/56] export typing and target registries in bf16 --- numba_cuda/numba/cuda/bf16.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 7ce28e459..693a8e573 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -1,5 +1,7 @@ from numba.cuda._internal.cuda_bf16 import ( - __nv_bfloat16 as bfloat16, + typing_registry, + target_registry, + nv_bfloat16 as bfloat16, htrunc, hceil, hfloor, @@ -18,13 +20,14 @@ htanh, htanh_approx, ) 
+from numba.cuda.types import Bfloat16 from numba.extending import overload import math def _make_unary(a, func): - if isinstance(a, bfloat16): + if isinstance(a, Bfloat16): return lambda a: func(a) @@ -90,6 +93,8 @@ def exp2_ol(a): __all__ = [ + "typing_registry", + "target_registry", "bfloat16", "htrunc", "hceil", From efc32f0aecd9a8bfb1b18af6d2468d6a2d3aa656 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:56:34 -0700 Subject: [PATCH 07/56] manually implement the lower_cast for float16 to bfloat16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 162eec8a8..fcc70298d 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -307,6 +307,62 @@ def conversion_impl(context, builder, fromty, toty, value): _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) +def _lower__float16_to_bfloat16(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZN13__float162bfloat16_nbst(int &ignore, __nv_bfloat16 *self , __half* hr) { + new (self) __nv_bfloat16(*hr); + return 0; + } + """ + + _ctor_decl___float162bfloat16 = declare_device( + "_ZN13__float162bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16), CPointer(float16)), + ) + + def __float162bfloat16_device_caller(arg_0, arg_1): + return _ctor_decl___float162bfloat16(arg_0, arg_1) + + def ctor_impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZN13__float162bfloat16_nbst", shim_raw_str) + selfptr = builder.alloca( + context.get_value_type(_type___nv_bfloat16), name="selfptr" + ) + argptrs = [ + builder.alloca(context.get_value_type(arg)) for arg in sig.args + ] + for ptr, ty, arg in zip(argptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", 
None)) + + context.compile_internal( + builder, + __float162bfloat16_device_caller, + signature( + int32, + CPointer(_type___nv_bfloat16), + CPointer(float16), + ), + (selfptr, *argptrs), + ) + return builder.load( + selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) + ) + + @lower_cast(float16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, float16), + [value], + ) + + +_lower__float16_to_bfloat16(shim_stream, shim_obj) + + def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int From b0f76e96a39bfdf0b1b557dccd50483a11fc07a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 22:32:17 -0700 Subject: [PATCH 08/56] add converting rules and unify rules --- numba_cuda/numba/cuda/types.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 844ce393a..92e3cafde 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -1,4 +1,5 @@ from numba.core import types +from numba.core.typeconv import Conversion class Dim3(types.Type): @@ -49,7 +50,32 @@ def __init__(self): super().__init__(name="__nv_bfloat16") self.alignof_ = 2 - self.bitwidth = 2 * 8 + self.bitwidth = 16 + + def can_convert_from(self, other): + if isinstance(other, types.Float): + return Conversion.unsafe + + elif isinstance(other, types.Integer): + if other.bitwidth == 8: + return Conversion.safe + + return Conversion.unsafe + + def can_convert_to(self, typingctx, other): + if isinstance(other, types.Float): + if other.bitwidth >= 32: + return Conversion.safe + else: + return Conversion.unsafe + elif isinstance(other, types.Integer): + return Conversion.unsafe + + return Conversion.unsafe + + def unify(self, typingctx, other): + if isinstance(other, (types.Float, types.Integer)): + return 
typingctx.unify_pairs(self, other) bfloat16 = Bfloat16() From 041862516163d2639421198d78ee029f81b1800d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:42:31 -0700 Subject: [PATCH 09/56] choose irType based on compute capability --- numba_cuda/numba/cuda/models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 768a1c3bf..02f629575 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -72,6 +72,14 @@ def format_constant(self, value): @register_model(Bfloat16) class _model___nv_bfloat16(PrimitiveModel): def __init__(self, dmm, fe_type): - # be_type = BfloatType() - be_type = ir.IntType(16) + from numba.cuda.api import get_current_device + + major, minor = get_current_device().compute_capability + + # Blackwell device leverage latest nvvm (llvm 20+ dialect) which has + # bfloat type + if major >= 10: + be_type = BfloatType() + else: + be_type = ir.IntType(16) super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) From 6ffa69666c84b8702aa5398abbc22d338d9311ee Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:43:41 -0700 Subject: [PATCH 10/56] vend ctk13 code --- numba_cuda/numba/cuda/cudadrv/nvrtc.py | 6 +- numba_cuda/numba/cuda/include/13/cuda_bf16.h | 5118 +++++++++++++++++ .../numba/cuda/include/13/cuda_bf16.hpp | 3865 +++++++++++++ 3 files changed, 8988 insertions(+), 1 deletion(-) create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 0c4074a73..7b1efc225 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -356,8 +356,12 @@ def compile(src, name, cc, ltoir=False): if nvrtc_ver_major == 11: numba_include = f"{os.path.join(numba_cuda_path, 'include', '11')}" - 
else: + elif nvrtc_ver_major == 12: numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}" + elif nvrtc_ver_major == 13: + numba_include = f"{os.path.join(numba_cuda_path, 'include', '13')}" + else: + raise RuntimeError(f"Unsupported CUDA version: {nvrtc_version}") if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS: extra_includes = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":") diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h new file mode 100644 index 000000000..38feffba0 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h @@ -0,0 +1,5118 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions. +* To use these functions, include the header file \p cuda_bf16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. 
Specific examples are: +* - hsin(__nv_bfloat16); +* - hcos(__nv_bfloat16); +* - h2sin(__nv_bfloat162); +* - h2cos(__nv_bfloat162); +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent +* the use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p __nv_bfloat16 which is essentially a user-defined type. +* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ - +* If defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these constants, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +#ifndef __CUDA_BF16_H__ +#define __CUDA_BF16_H__ + +/* bring in __half data type and operations, for use in converting constructors */ +#include "cuda_fp16.h" + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* defined(__CUDACC__) || 
defined(_NVHPC_CUDA) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +#define __CUDA_BF16_TYPES_EXIST__ + +/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ +#define __BFLOAT16_TO_US(var) *(reinterpret_cast(&(var))) +#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_UI(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast(&(var))) + +/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ +struct __nv_bfloat16; +struct __nv_bfloat162; + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode. +* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2bfloat16(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. 
+* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-towards-zero mode. +* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-down mode. +* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-up mode. +* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_ru(NaN) returns NaN. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts \p nv_bfloat16 number to float.
+*
+* \details Converts nv_bfloat16 number \p a to float.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns float
+* - \p a converted to float.
+* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __bfloat162float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+*
+* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+* \param[in] a - float. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
+* precision number.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
+* mode and returns \p nv_bfloat162 with converted values. 
+* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the +* result as a \p float2 packed value. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed +* char in round-towards-zero mode. NaN inputs are converted to 0. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __bfloat162char_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. 
Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. 
Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rd(NaN) returns 0.* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. +* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_ru(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. 
+* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p x and \p y into one \p nv_bfloat162 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - nv_bfloat16. Is only being read. +* \param[in] y - nv_bfloat16. Is only being read. +* +* \returns __nv_bfloat162 +* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. 
NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. 
+* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The truncated integer value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The smallest integer value not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of largest integers which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+*
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16
+* number.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Swaps both halves of the \p nv_bfloat162 input.
+*
+* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
+* with swapped halves.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - \p a with its halves being swapped.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. 
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. 
+* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. 
Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. 
Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. 
Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. 
+* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. 
+* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. 
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. 
Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. 
+* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). 
+* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even +* mode. 
+* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function. +* +* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function. +* +* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. 
+* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. 
Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. 
The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices use emulation path. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices of compute capability 7.x and 8.x use emulation path. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. +* +* \returns __nv_bfloat16 +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for ::std::move. + * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_INLINE__ +#define __CUDA_BF16_FORCEINLINE__ +#else +#define __CUDA_BF16_INLINE__ inline +#define __CUDA_BF16_FORCEINLINE__ __forceinline__ +#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_BF16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_BF16_CONSTEXPR__ constexpr +#else +#define __CUDA_BF16_CONSTEXPR__ +#endif + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat16_raw data type + * \details Type allows static initialization of \p nv_bfloat16 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat16, + * and not a conversion from \p short to \p nv_bfloat16. 
+ * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p nv_bfloat16 floating-point number. + */ + unsigned short x; +} __nv_bfloat16_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat162_raw data type + * \details Type allows static initialization of \p nv_bfloat162 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat162, + * and not a conversion from \p short2 to \p nv_bfloat162. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p nv_bfloat16 part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p nv_bfloat16 part. + */ + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. + */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. 
+ */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile; + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode. + */ + explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2bfloat16(__half2float(f)).__x; +) +} +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p float operator. + */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __int2bfloat16_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __uint2bfloat16_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162char_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162uchar_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in signed and unsigned char operators. + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162short_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ushort_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162int_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p int data type. 
+ * Using round-toward-zero rounding mode. + * + * See __bfloat162uint_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ll_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ull_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 addition operation. + * See also __hadd(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 subtraction operation. + * See also __hsub(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 multiplication operation. 
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 division operation. + * See also __hdiv(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with division operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix decrement operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored); +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary minus operator. + * See also __hneg(__nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h); + +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered compare equal operation. + * See also __heq(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 unordered compare not-equal operation. + * See also __hneu(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-than compare operation. 
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-than compare operation. + * See also __hlt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hge(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-or-equal compare operation. + * See also __hle(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +/** +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat162 datatype + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + /** + * Storage field holding lower \p __nv_bfloat16 part. + */ + __nv_bfloat16 x; + /** + * Storage field holding upper \p __nv_bfloat16 part. + */ + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src); +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162(); +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from two \p __nv_bfloat16 variables + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src); + + /* Convert to/from __nv_bfloat162_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const; +}; + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 addition operation. 
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 subtraction operation. + * See also __hsub2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 multiplication operation. + * See also __hmul2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 division operation. + * See also __h2div(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with division operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary minus operator. + * See also __hneg2(__nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered compare equal operation. + * See also __hbeq2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 unordered compare not-equal operation. 
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-than compare operation. + * See also __hbgt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-than compare operation. + * See also __hblt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hbge2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation. 
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); + +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ +#ifdef __CUDACC_RTC__ +inline +#else +__CUDA_BF16_FORCEINLINE__ +#endif +__half::__half(const __nv_bfloat16 f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2half_rn(__bfloat162float(f)).__x; +) +} +#endif +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) +/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. +*/ +#include "cuda_bf16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the bfloat16 numbers format. + * + * \details Should be implemented in the compiler in the future. 
+ * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat16 nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of bfloat16 numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_BF16_INLINE__ +#undef __CUDA_BF16_FORCEINLINE__ +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp new file mode 100644 index 000000000..5f610c976 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp @@ -0,0 +1,3865 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_BF16_HPP__) +#define __CUDA_BF16_HPP__ + +#if !defined(__CUDA_BF16_H__) +#error "Do not include this file directly. Instead, include cuda_bf16.h." 
+#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines floating-point positive infinity value for the \p nv_bfloat16 data type + */ +#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines canonical NaN value for the \p nv_bfloat16 data type + */ +#define CUDART_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p nv_bfloat16 data type + */ +#define CUDART_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a maximum representable value for the \p nv_bfloat16 data type + */ +#define CUDART_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a negative zero value for the \p nv_bfloat16 data type + */ +#define CUDART_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a positive zero value for the \p nv_bfloat16 data type + */ +#define CUDART_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a value of 1.0 for the \p nv_bfloat16 data type + */ +#define CUDART_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U) + +#if !(defined __DOXYGEN_ONLY__) + + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const 
__nv_bfloat16_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator float() const { return __bfloat162float(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. 
+ */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator signed char() const { return __bfloat162char_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned char() const { return __bfloat162uchar_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast(__bfloat162char_rz(*this)); + } + else + { + value = static_cast(__bfloat162uchar_rz(*this)); + } + return value; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator short() const { return __bfloat162short_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned short() const { return __bfloat162ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator int() const { return __bfloat162int_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned int() const { return __bfloat162uint_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = 
static_cast(__bfloat162ll_rz(*this)); + } + else + { + retval = static_cast(__bfloat162int_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__bfloat162ull_rz(*this)); + } + else + { + retval = static_cast(__bfloat162uint_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long long() const { return __bfloat162ll_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long long() const { return __bfloat162ull_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } +#endif /* !(defined 
__CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ + + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(__nv_bfloat162 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(__nv_bfloat162 
&&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#else +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::operator __nv_bfloat162_raw() const { + __nv_bfloat162_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); +, + ret.x = static_cast<__nv_bfloat16_raw>(this->x).x; + ret.y = static_cast<__nv_bfloat16_raw>(this->y).x; +) + return ret; +} + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ 
__nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_float_as_uint(const float f) +{ + unsigned int u; +IF_DEVICE_OR_CUDACC( + u = __float_as_uint(f); +, + memcpy(&u, &f, sizeof(f)); +, + ::std::memcpy(&u, &f, sizeof(f)); +) + return u; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_uint_as_float(const unsigned int u) +{ + float f; +IF_DEVICE_OR_CUDACC( + f = __uint_as_float(u); +, + memcpy(&f, &u, sizeof(u)); +, + ::std::memcpy(&f, &u, sizeof(u)); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + + x = __internal_float_as_uint(f); + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast(0x7fffU); + } + sign = x >> 31U; + remainder = x << 16U; + return static_cast(x >> 16U); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_double2float_rn(const double x) +{ + float r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f32.f64 %0, %1;" : "=f"(r) : "d"(x)); +, + r = static_cast(x); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ double __internal_float2double(const float x) +{ + double r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.f64.f32 %0, %1;" : "=d"(r) : "f"(x)); +, + r = static_cast(x); +) + return r; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); + return val; +, + float f = __internal_double2float_rn(x); + const double d = __internal_float2double(f); + unsigned int u = __internal_float_as_uint(f); + + bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); + + + if ((x > 0.0) && (d > x)) { + u--; + } + 
if ((x < 0.0) && (d < x)) { + u--; + } + if ((d != x) && x_is_not_nan) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + + return __float2bfloat16(f); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; + 
return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg .b16 low;\n" + " cvt.rn.bf16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ float __internal_device_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); +, + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); +) + return f; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + f = __internal_device_bfloat162float(h); +, + unsigned int u = static_cast(h) << 16; + f = __internal_uint_as_float(u); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) +{ + return 
__internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); +} +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); +} + +/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) +{ + __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) +{ + float hi_float; + float lo_float; + lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); + hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2int_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ int __internal_bfloat162int_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2int_rz(f); +, + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f >= static_cast(max_val)) 
{ + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162int_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __int2float_ru(i); + const float rd = __int2float_rd(i); + float rz = __int2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_int2bfloat16_rn(i); +, + const double d = static_cast(i); + return __double2bfloat16(d); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h) +{ + signed char i; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.s8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + const unsigned char u = static_cast(tmp); + i = static_cast(u); +, + const float f = __bfloat162float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.u8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + i = static_cast(tmp); +, + const float f = __bfloat162float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return 
__float2bfloat16_rz(__int2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ short int __internal_device_bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162short_rz(h); +, + const float f = __bfloat162float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum 
+ val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__int2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(static_cast(i))); +) +} 
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_bfloat162uint_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2uint_rz(f); +, + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162uint_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const 
__nv_bfloat16 h) +{ + unsigned int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __uint2float_ru(i); + const float rd = __uint2float_rd(i); + float rz = __uint2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_uint2bfloat16_rn(i); +, + const double d = static_cast(i); + return __double2bfloat16(d); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ unsigned short int 
__bfloat162ushort_rn(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ unsigned short int __internal_device_bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162ushort_rz(h); +, + const float f = __bfloat162float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __internal_device_bfloat162ull_rz(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ull_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ull_rz(h); +, + const float f = __bfloat162float(h); + unsigned long long int i; + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ull2bfloat16_rn(const 
unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ull2float_ru(i); + const float rd = __ull2float_rd(i); + float rz = __ull2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ull2bfloat16_rn(i); +, + float f = static_cast(i); + const unsigned long long int uf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + // round up happened here + // note: no need to handle round up to f == 0x1.p64 specially + if (uf > i) { + u--; + } + if (uf != i) { + u |= 1U; + } + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ull2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ull2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ull2float_ru(i)); +) +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + long long int i; + asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ll_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ long long int __internal_device_bfloat162ll_rz(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ll_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ll_rz(h); +, + long long int i; + const float f = __bfloat162float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = min_val; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return 
i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ll2float_ru(i); + const float rd = __ll2float_rd(i); + float rz = __ll2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ll2bfloat16_rn(i); +, + float f = static_cast(i); + const long long int lf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + + if ((f > 0.0f) && (lf > i)) { + u--; + } + if ((f < 0.0f) && (lf < i)) { + u--; + } + if (lf != i) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ll2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ll2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ll2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 
r; + asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(truncf(__bfloat162float(h))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rpi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rmi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(rintf(__bfloat162float(h))); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = htrunc(h.x); + const __nv_bfloat16 high = htrunc(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hceil(h.x); + const __nv_bfloat16 high = hceil(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hfloor(h.x); + const __nv_bfloat16 high = hfloor(h.y); + return __nv_bfloat162(low, high); +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) +{ + return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); +, + val.x = a.x; + val.y = b.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); +, + val.x = a.y; + val.y = b.y; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); +, + ret = a.x; +) + return ret; +} +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) +{ + int retval; + const __nv_bfloat16_raw araw = __nv_bfloat16_raw(a); + if (araw.x == 0xFF80U) { + retval = -1; + } else if (araw.x == 0x7F80U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.x; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.y; + val.y = a.y; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); +, + ret = a.y; +) + return ret; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); +, + val.x = a; + val.y = b; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); +, + val.x = a; + val.y = a; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.y; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return static_cast(__BFLOAT16_TO_CUS(h)); +, + return static_cast(__nv_bfloat16_raw(h).x); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __BFLOAT16_TO_CUS(h); +, + return __nv_bfloat16_raw(h).x; +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = static_cast(i); + return h; +, + __nv_bfloat16_raw hr; + hr.x = static_cast(i); + return __nv_bfloat16(hr); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = i; + return h; +, + __nv_bfloat16_raw hr; + hr.x = i; + return __nv_bfloat16(hr); +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __nv_bfloat16, __nv_bfloat162 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name, var, delta, c, mask) /* do */ {\ + __nv_bfloat162 r; \ + asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_SYNC_BFLOAT162_MACRO + +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2bfloat16(temp2); +} + 
+/****************************************************************************** +* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm 
("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} + +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm 
("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} + +#undef __LDG_PTR +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __nv_bfloat162 comparison * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ +,\ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " shr.u32 low_res, low_res, 16;\n"\ + " or.b32 %0, high_res, low_res;}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ +)\ + return val; \ +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} 
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_heq2(a, b); +, + __nv_bfloat162_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hne2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hle2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hge2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hlt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hequ2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hleu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgeu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hltu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgtu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} + +/****************************************************************************** +* __nv_bfloat162 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT162_MACRO_MASK(name) {\ + unsigned val; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".u32.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO_MASK + +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + unsigned int val; \ + bool retval; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + if (val == 0x3F803F80U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) +, + return (__heq(a.x, b.x) && __heq(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) +, + return (__hne(a.x, b.x) && __hne(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool 
__hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) +, + return (__hle(a.x, b.x) && __hle(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) +, + return (__hge(a.x, b.x) && __hge(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) +, + return (__hlt(a.x, b.x) && __hlt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) +, + return (__hgt(a.x, b.x) && __hgt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) +, + return (__hequ(a.x, b.x) && __hequ(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) +, + return (__hneu(a.x, b.x) && __hneu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) +, + return (__hleu(a.x, b.x) && __hleu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) +, + return (__hgeu(a.x, b.x) && __hgeu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +, + return (__hltu(a.x, b.x) && __hltu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +, + return (__hgtu(a.x, b.x) && __hgtu(a.y, b.y)); +) +} +#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO +/****************************************************************************** +* __nv_bfloat16 comparison * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +,\ + unsigned int val; \ + asm( "{.reg .b32 a,b;\n"\ + " mov.b32 a, {0, %1};\n"\ + " mov.b32 b, {0, %2};\n"\ + " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ + :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +)\ +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(eq) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ne) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(le) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ge) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(lt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(equ) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(neu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(leu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(geu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ltu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gtu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_BFLOAT16_MACRO +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + 
__nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2(a, b); +, + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2(a, b); +, + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2(a, b); +, + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2_rn(a, b); +, + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2_rn(a, b); +, + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2_rn(a, b); +, + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); +) + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f,%1,one,%2;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mone;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mone, 0xbf80bf80U;\n" + " fma.rn.bf16x2 f,%2,mone,%1;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mzero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mzero, 0x80008000U;\n" + " fma.rn.bf16x2 f,%1,%2,mzero;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ .reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f, %1, %2, %3;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { + __nv_bfloat16 ha, hb; + + ha = __low2bfloat16(a); + hb = __low2bfloat16(b); + + const __nv_bfloat16 v1 = __hdiv(ha, hb); + + ha = __high2bfloat16(a); + hb = __high2bfloat16(b); + + const __nv_bfloat16 v2 = __hdiv(ha, hb); + + return __halves2bfloat162(v1, v2); +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * 
+******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " 
fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, 1.0f, fb)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code 
+ val = __float2bfloat16(__fmaf_ieee_rn(fb, -1.0f, fa)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, fb, -0.0f)); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa + fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa - fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa * fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hadd_rn(a, b); +, + return __hadd(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hsub_rn(a, b); +, + return __hsub(a, b); + +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hmul_rn(a, b); +, + return __hmul(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, one, %2;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hadd(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mone;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mone, 0xbf80U;\n" + " fma.rn.bf16 f, %2, mone, %1;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hsub(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mzero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mzero, 0x8000U;\n" + " fma.rn.bf16 f, %1, %2, mzero;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hmul(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ 
+ __nv_bfloat16 val; + asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, %2, %3;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ + __nv_bfloat16 val; \ + asm( "{.reg .b32 a,b,res;\n"\ + " mov.b32 a, {0,%1};\n"\ + " mov.b32 b, {0,%2};\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ + " cvt.rn.bf16.f32 %0, res;}\n"\ + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { + const float two_126 = __uint_as_float(0x7E800000U) ; //2^126 + const float a_f = __bfloat162float(a); + float b_f = __bfloat162float(b); + float ans; + bool b_big = (fabsf(b_f) >= two_126); + if(b_big){b_f *= 0.25f;} + + // f32 div approximation. Good enough for c-r bfloat div. 
+ asm("{ div.approx.f32 %0, %1, %2; }" : "=f"(ans) : "f"(a_f), "f"(b_f)); + + // Prevent ftz: + if(b_big){ans = __fmaf_ieee_rn(ans, 0.25f, -0.0f);} + return __float2bfloat16(ans); +} + +#undef __BINARY_OP_BFLOAT16_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hdiv(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa / fb); +) +} + +/****************************************************************************** +* __nv_bfloat162 functions * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + float r = sinf(f); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of sinf() + // Otherwise, ftz=on, then sinf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { + return __hsin_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = cosf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { + return __hcos_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16exp(const float x) +{ + const float log2e_up = __uint_as_float(0x3FB8AA3CU); + float fa = x * log2e_up; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return fa; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16exp(fa); + return __float2bfloat16_rn(fa); +} + +#define __APPROX_FCAST2(fun) /* do */ {\ + __nv_bfloat162 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " mov.b32 fl, {0,hl}; \n"\ + " mov.b32 fu, {0,hu}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ + " cvt.rn.bf16.f32 hl, fl; \n"\ + " cvt.rn.bf16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," 
__CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" + +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + return __floats2bfloat162_rn( __internal_device_fast_bf16exp(__low2float(a)), __internal_device_fast_bf16exp(__high2float(a)) ); +) +} + +__CUDA_BF16_DECL__ float __internal_device_tanhf_noftz(const float x) +{ + float f = x; + float r = tanhf(x); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of tanhf() + // Otherwise, ftz=on, then tanhf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return f; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a) { + float f = __bfloat162float(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f)); +, + f = __internal_device_tanhf_noftz(f); +) + __nv_bfloat16 h = __float2bfloat16_rn(f); + return h; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a) { + float2 f = __bfloat1622float2(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.x)); + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.y)); +, + f.x = __internal_device_tanhf_noftz(f.x); + f.y = __internal_device_tanhf_noftz(f.y); +) + __nv_bfloat162 h = __float22bfloat162_rn(f); + return h; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a) { + __nv_bfloat16 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16_raw hr = (__nv_bfloat16_raw)a; + asm("tanh.approx.bf16 %0, %0;" : "+h"(hr.x)); + r = (__nv_bfloat16)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a) { + __nv_bfloat162 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("tanh.approx.bf16x2 %0, %1;" : "=r"(__BFLOAT162_TO_UI(res)) : "r"(__BFLOAT162_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(ex2) +, + float fl = __low2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { + const float log10_2 = __uint_as_float(0x40549A78U); + float fa = 
__bfloat162float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + __nv_bfloat16 r = __float2bfloat16_rn(fa); + __nv_bfloat16_raw araw = static_cast<__nv_bfloat16_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + araw.x = 0x3f75U; + r = static_cast<__nv_bfloat16>(araw); + } + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + const float log10_2 = __uint_as_float(0x40549A78U); + float fl = __low2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + + float fh = __high2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + + r = __floats2bfloat162_rn( fl, fh ); + + const __nv_bfloat162_raw araw = static_cast<__nv_bfloat162_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.x = static_cast<__nv_bfloat16>(raw_fix); + } + if (araw.y == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.y = static_cast<__nv_bfloat16>(raw_fix); + } +) + return r; +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16log2(float x) +{ + asm("{ lg2.approx.f32 %0, %0; }" : "+f"(x)); + return x; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + return __float2bfloat16_rn(fa); +} + +__CUDA_BF16_DECL__ 
__nv_bfloat162 h2log2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(lg2) +, + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { + const float flt_ln2 = __uint_as_float(0x3f317218U); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_ln2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_ln2 = __uint_as_float(0x3f317218U); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_ln2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_ln2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_log10_2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " 
mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_log10_2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_log10_2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { + float fl = __low2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(rsqrt) +, + float fl = __low2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(sqrt) +, + float fl = __low2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fh)); + 
return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +#undef __APPROX_FCAST2 +#undef __BF16_SPEC_CASE2 + +__CUDA_BF16_DECL__ bool __internal_device_hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return __BFLOAT16_TO_CUS(r) != 0U; +, + unsigned int r; + asm( "{.reg .b32 a;\n" + " mov.b32 a, {0,%1};\n" + " set.nan.f32.f32 %0, a, a;}\n" + :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); + return r != 0U; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + __nv_bfloat162_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + r = __nv_bfloat162(val); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hisnan(a); +, + const __nv_bfloat16_raw hr = static_cast<__nv_bfloat16_raw>(a); + return ((hr.x & 0x7FFFU) > 0x7F80U); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{neg.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{neg.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + const float fa = __bfloat162float(a); + return __float2bfloat16(__fmaf_ieee_rn(fa, -1.0f, -0.0f)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneg(a); +, + const float fa = __bfloat162float(a); + return __float2bfloat16(-fa); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{abs.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{abs.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + __nv_bfloat16_raw abs_a_raw = static_cast<__nv_bfloat16_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned 
short)0x7F80U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__nv_bfloat16>(abs_a_raw); +) +} + +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else + { + maxval = __hge(a, b) ? a : b; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else + { + minval = __hle(a, b) ? 
a : b; + } + + return minval; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax_nan(a.x, b.x); + val.y = __hmax_nan(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin_nan(a.x, b.x); + val.y = __hmin_nan(a.y, b.y); + return val; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); + __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_bfloat162(real_tmp, img_tmp); +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat162 r; + asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" + : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__nv_bfloat162*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" + : "=h"(__BFLOAT16_TO_US(r)) + : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) + : "memory"); + return r; +, + unsigned short int* address_as_us = (unsigned short int*)address; + unsigned short int old = *address_as_us; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(address_as_us, assumed, + __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); + return __ushort_as_bfloat16(old); +) +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +#endif /* !(defined __DOXYGEN_ONLY__) */ + +#endif /* defined(__cplusplus) */ + +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_BF16_DECL__ +#undef __CUDA_BF16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#undef __CPP_VERSION_AT_LEAST_11_BF16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_HPP__ */ From 
834a905f5747c5f0f9bedbeb0e61fdd8dbc54892 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:44:03 -0700 Subject: [PATCH 11/56] regenerate with ctk13 --- configs/cuda_bf16.yml | 4 +- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 108 ++++++------------- 2 files changed, 33 insertions(+), 79 deletions(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 348e48ee7..9d4727aae 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -2,9 +2,9 @@ Name: Numba Bfloat16 Version: 0.0.2 GPU Arch: - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16 -Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h +Entry Point: ./numba_cuda/numba/cuda/include/13/cuda_bf16.h File List: - - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h + - ./numba_cuda/numba/cuda/include/13/cuda_bf16.h Exclude: {} Types: __nv_bfloat16_raw: Number diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index fcc70298d..6483cfd73 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -12,7 +12,6 @@ # Imports: import io import operator -import os import numba from llvmlite import ir @@ -25,9 +24,9 @@ ) from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast -from numba.cuda.typing.templates import Registry as TypingRegistry -from numba.cuda.typing import signature -from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing import signature +from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device from numba.cuda.types import bfloat16 from numba.cuda.vector_types import vector_types @@ -55,9 +54,6 @@ float32x2 = vector_types["float32x2"] __half = float16 -# Setups: - - typing_registry = TypingRegistry() register = 
typing_registry.register register_attr = typing_registry.register_attr @@ -299,68 +295,12 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(_type_unnamed1405307)), - value, - ) - - -_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) - - -def _lower__float16_to_bfloat16(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - _ZN13__float162bfloat16_nbst(int &ignore, __nv_bfloat16 *self , __half* hr) { - new (self) __nv_bfloat16(*hr); - return 0; - } - """ - - _ctor_decl___float162bfloat16 = declare_device( - "_ZN13__float162bfloat16_nbst", - int32(CPointer(_type___nv_bfloat16), CPointer(float16)), - ) - - def __float162bfloat16_device_caller(arg_0, arg_1): - return _ctor_decl___float162bfloat16(arg_0, arg_1) - - def ctor_impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("_ZN13__float162bfloat16_nbst", shim_raw_str) - selfptr = builder.alloca( - context.get_value_type(_type___nv_bfloat16), name="selfptr" - ) - argptrs = [ - builder.alloca(context.get_value_type(arg)) for arg in sig.args - ] - for ptr, ty, arg in zip(argptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - - context.compile_internal( - builder, - __float162bfloat16_device_caller, - signature( - int32, - CPointer(_type___nv_bfloat16), - CPointer(float16), - ), - (selfptr, *argptrs), - ) - return builder.load( - selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) - ) - - @lower_cast(float16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, float16), + signature(_type___nv_bfloat16, fromty), [value], ) -_lower__float16_to_bfloat16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) def 
_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): @@ -454,7 +394,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, float32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -507,7 +447,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, float64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -560,7 +500,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int16), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -613,7 +553,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint16), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -666,7 +606,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -719,7 +659,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -772,7 +712,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -825,7 +765,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -878,7 +818,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -931,7 +871,7 @@ def 
conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -1954,8 +1894,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat162, CPointer(_type_unnamed1405416)), - value, + signature(_type___nv_bfloat162, fromty), + [value], ) @@ -13672,6 +13612,10 @@ def impl(context, builder, sig, args): _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) +def __half(): + pass + + def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int @@ -15977,6 +15921,15 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) +@register +class _typing___half(ConcreteTemplate): + key = globals()["__half"] + cases = [signature(void, _type___nv_bfloat16)] + + +register_global(__half, types.Function(_typing___half)) + + @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -16427,6 +16380,7 @@ class _typing_operator_le(ConcreteTemplate): "operator<", "operator>=", "operator<=", + "__half", ] From 577f00aafb1e41af6dc69385db155e3d1d1da904 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:45:46 -0700 Subject: [PATCH 12/56] explicitly test against bfloat16 type --- numba_cuda/numba/cuda/bf16.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 693a8e573..65446d51f 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -20,14 +20,13 @@ htanh, htanh_approx, ) -from numba.cuda.types import Bfloat16 from numba.extending import overload import math def _make_unary(a, func): - if isinstance(a, Bfloat16): + if a == bfloat16: return lambda a: func(a) From 765f8ee4143a811830759233c2115a50f9dab0f0 Mon Sep 17 00:00:00 2001 
From: Michael Wang Date: Fri, 15 Aug 2025 10:43:38 -0700 Subject: [PATCH 13/56] hand write lower cast fp16->bf16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 6483cfd73..2a7032d9c 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -345,6 +345,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # In C++, this cast is an explicit constructor call, by default Numbast will not generate + # this lower cast. We implement this by hand to enable the cast from fp16 to bf16. + @lower_cast(float16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From c443e4d6624e2ce82e89ec8148d78a6c6148df93 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 11:07:10 -0700 Subject: [PATCH 14/56] ptx test for several basic ptx --- .../tests/cudapy/test_bfloat16_bindings.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py index 3538fb230..abc8e47d8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py @@ -1,6 +1,7 @@ import numba.cuda as cuda from numba.cuda.testing import unittest, CUDATestCase import numpy as np +import operator from numba import ( config, @@ -291,6 +292,36 @@ def kernel(arr): np.testing.assert_allclose(arr, [3], atol=1e-2) + def test_bf16_intrinsics_used_in_lto(self): + self.skip_unsupported() + + operations = [ + (operator.add, "fma.rn.bf16"), + 
(operator.sub, "fma.rn.bf16"), + (operator.mul, "fma.rn.bf16"), + ( + operator.truediv, + "div.approx.f32", + ), # no native bf16 div, see cuda_bf16.hpp:L3067 + ] + + for op, ptx_op in operations: + with self.subTest(op=op): + + @cuda.jit(lto=True) + def kernel(arr): + a = nv_bfloat16(3.14) + b = nv_bfloat16(5) + arr[0] = float32(op(a, b)) + + arr = np.zeros(1, np.float32) + kernel[1, 1](arr) + np.testing.assert_allclose(arr, [op(3.14, 5)], atol=1e-1) + + ptx = next(iter(kernel.inspect_lto_ptx().values())) + + assert ptx_op in ptx, ptx + if __name__ == "__main__": unittest.main() From 166c9ae74c704ae5e4b4bdf619c27a65a86e28cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 12:44:33 -0700 Subject: [PATCH 15/56] add double underscore intrinsics --- numba_cuda/numba/cuda/bf16.py | 34 ++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 101 +++++++++++++++++- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 65446d51f..1ac3798c0 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -2,6 +2,23 @@ typing_registry, target_registry, nv_bfloat16 as bfloat16, + # Arithmetic intrinsics + __habs, + __hadd, + __hsub, + __hmul, + __hadd_rn, + __hsub_rn, + __hmul_rn, + __hdiv, + __hadd_sat, + __hsub_sat, + __hmul_sat, + __hfma, + __hfma_sat, + __hneg, + __hfma_relu, + atomicAdd, htrunc, hceil, hfloor, @@ -95,6 +112,23 @@ def exp2_ol(a): "typing_registry", "target_registry", "bfloat16", + # Arithmetic intrinsics + "__habs", + "__hadd", + "__hsub", + "__hmul", + "__hadd_rn", + "__hsub_rn", + "__hmul_rn", + "__hdiv", + "__hadd_sat", + "__hsub_sat", + "__hmul_sat", + "__hfma", + "__hfma_sat", + "__hneg", + "__hfma_relu", + "atomicAdd", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 49e843abe..af25a3860 100644 --- 
a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,5 +1,22 @@ from numba import cuda, float32 -from numba.cuda.bf16 import bfloat16 +from numba.cuda.bf16 import ( + bfloat16, + __habs, + __hadd, + __hsub, + __hmul, + __hdiv, + __hadd_rn, + __hsub_rn, + __hmul_rn, + __hadd_sat, + __hsub_sat, + __hmul_sat, + __hfma, + __hfma_sat, + __hfma_relu, + __hneg, +) from numba.cuda.testing import CUDATestCase import math @@ -60,3 +77,85 @@ def kernel(arr): self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1) else: self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2) + + def test_arithmetic_intrinsics_basic(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.25) + b = bfloat16(-2.5) + + out[0] = float32(__habs(b)) + out[1] = float32(__hadd(a, b)) + out[2] = float32(__hsub(a, b)) + out[3] = float32(__hmul(a, b)) + out[4] = float32(__hdiv(b, a)) + out[5] = float32(__hneg(a)) + out[6] = float32(__hfma(a, b, b)) + + out[7] = float32(__hadd_rn(a, b)) + out[8] = float32(__hsub_rn(a, b)) + out[9] = float32(__hmul_rn(a, b)) + + out = cuda.device_array((10,), dtype="float32") + kernel[1, 1](out) + + a = 1.25 + b = -2.5 + expected = [ + abs(b), + a + b, + a - b, + a * b, + b / a, + -a, + a * b + b, + a + b, + a - b, + a * b, + ] + for i, exp in enumerate(expected): + self.assertAlmostEqual(out[i], exp, delta=1e-2) + + def test_arithmetic_intrinsics_saturating(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + b = bfloat16(0.75) + + out[0] = float32(__hadd_sat(a, b)) # 2.25 -> 1.0 + out[1] = float32(__hsub_sat(b, a)) # -0.75 -> 0.0 + out[2] = float32(__hmul_sat(a, b)) # 1.125 -> 1.0 + out[3] = float32(__hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.0, delta=1e-3) + self.assertAlmostEqual(out[1], 0.0, delta=1e-3) + self.assertAlmostEqual(out[2], 1.0, 
delta=1e-3) + self.assertAlmostEqual(out[3], 1.0, delta=1e-3) + + # Also check they are clamped within [0, 1] + for i in range(4): + self.assertGreaterEqual(out[i], 0.0) + self.assertLessEqual(out[i], 1.0) + + def test_fma_relu_intrinsic(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(-1.5) + b = bfloat16(2.0) + c = bfloat16(0.0) + + out[0] = float32(__hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 0.0, delta=1e-3) From 6d8fd66f0d823738acace44f30c9078a19a3f0b2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 12:51:37 -0700 Subject: [PATCH 16/56] regnerate with globals --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1223 ++++++++++++------ 1 file changed, 817 insertions(+), 406 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 2a7032d9c..86ddba13b 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,7 +1,7 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: # Ast_canopy version: 0.4.0 -# Numbast version: 0.4.0 +# Numbast version: 0.5.0 # Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} # Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml @@ -23,12 +23,11 @@ register_model, ) from numba.core.imputils import Registry as TargetRegistry -from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from 
numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda.types import bfloat16 +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -52,7 +51,7 @@ ) float32x2 = vector_types["float32x2"] -__half = float16 + typing_registry = TypingRegistry() register = typing_registry.register @@ -190,7 +189,28 @@ class _ctor_template_unnamed1405416(ConcreteTemplate): register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -290,15 +310,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) @@ -345,17 +356,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # In C++, this cast is an explicit constructor call, by default 
Numbast will not generate - # this lower cast. We implement this by hand to enable the cast from fp16 to bf16. - @lower_cast(float16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -400,15 +400,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(float32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) @@ -453,15 +444,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(float64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) @@ -506,15 +488,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) @@ -559,15 +532,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) @@ -612,15 +576,6 @@ def ctor_impl(context, builder, sig, args): 
selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) @@ -665,15 +620,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) @@ -718,15 +664,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) @@ -771,15 +708,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) @@ -824,15 +752,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) @@ -877,15 +796,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) 
) - @lower_cast(uint64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @@ -1900,15 +1810,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) - @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat162, fromty), - [value], - ) - _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) @@ -2002,7 +1903,9 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - @lower(__double2bfloat16, float64) + handle = globals()["__double2bfloat16"] + + @lower(handle, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) @@ -2041,7 +1944,9 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - @lower(__float2bfloat16, float32) + handle = globals()["__float2bfloat16"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) @@ -2080,7 +1985,9 @@ def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - @lower(__float2bfloat16_rn, float32) + handle = globals()["__float2bfloat16_rn"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -2121,7 +2028,9 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - @lower(__float2bfloat16_rz, float32) + handle = globals()["__float2bfloat16_rz"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2162,7 +2071,9 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - @lower(__float2bfloat16_rd, float32) + handle = globals()["__float2bfloat16_rd"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2203,7 +2114,9 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - @lower(__float2bfloat16_ru, float32) + handle = globals()["__float2bfloat16_ru"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2245,7 +2158,9 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162float, _type___nv_bfloat16) + handle = globals()["__bfloat162float"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2287,7 +2202,9 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - @lower(__float2bfloat162_rn, float32) + handle = 
globals()["__float2bfloat162_rn"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2329,7 +2246,9 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - @lower(__floats2bfloat162_rn, float32, float32) + handle = globals()["__floats2bfloat162_rn"] + + @lower(handle, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2373,7 +2292,9 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - @lower(__low2float, _type___nv_bfloat162) + handle = globals()["__low2float"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2415,7 +2336,9 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - @lower(__high2float, _type___nv_bfloat162) + handle = globals()["__high2float"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2457,7 +2380,9 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - @lower(__float22bfloat162_rn, float32x2) + handle = globals()["__float22bfloat162_rn"] + + @lower(handle, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-2499,7 +2424,9 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - @lower(__bfloat1622float2, _type___nv_bfloat162) + handle = globals()["__bfloat1622float2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2541,7 +2468,9 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162char_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162char_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2583,7 +2512,9 @@ def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uchar_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162uchar_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2625,7 +2556,9 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2667,7 +2600,9 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, 
shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2709,7 +2644,9 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2751,7 +2688,9 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162int_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2792,7 +2731,9 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - @lower(__int2bfloat16_rn, int32) + handle = globals()["__int2bfloat16_rn"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) @@ -2831,7 +2772,9 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - @lower(__int2bfloat16_rz, int32) + handle = 
globals()["__int2bfloat16_rz"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) @@ -2870,7 +2813,9 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - @lower(__int2bfloat16_rd, int32) + handle = globals()["__int2bfloat16_rd"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) @@ -2909,7 +2854,9 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - @lower(__int2bfloat16_ru, int32) + handle = globals()["__int2bfloat16_ru"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) @@ -2949,7 +2896,9 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2991,7 +2940,9 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3033,7 +2984,9 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3075,7 +3028,9 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162short_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3116,7 +3071,9 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - @lower(__short2bfloat16_rn, int16) + handle = globals()["__short2bfloat16_rn"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3157,7 +3114,9 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return _ZL19__short2bfloat16_rzs_nbst(arg_0) - @lower(__short2bfloat16_rz, int16) + handle = globals()["__short2bfloat16_rz"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3198,7 +3157,9 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def 
_ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - @lower(__short2bfloat16_rd, int16) + handle = globals()["__short2bfloat16_rd"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3239,7 +3200,9 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - @lower(__short2bfloat16_ru, int16) + handle = globals()["__short2bfloat16_ru"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3281,7 +3244,9 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3323,7 +3288,9 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3365,7 +3332,9 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rd, _type___nv_bfloat16) + handle = 
globals()["__bfloat162uint_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3407,7 +3376,9 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3448,7 +3419,9 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - @lower(__uint2bfloat16_rn, uint32) + handle = globals()["__uint2bfloat16_rn"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3489,7 +3462,9 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - @lower(__uint2bfloat16_rz, uint32) + handle = globals()["__uint2bfloat16_rz"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3530,7 +3505,9 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - @lower(__uint2bfloat16_rd, uint32) + handle = globals()["__uint2bfloat16_rd"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3571,7 +3548,9 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): def 
_ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - @lower(__uint2bfloat16_ru, uint32) + handle = globals()["__uint2bfloat16_ru"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3613,7 +3592,9 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3655,7 +3636,9 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3697,7 +3680,9 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3739,7 +3724,9 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3780,7 +3767,9 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - @lower(__ushort2bfloat16_rn, uint16) + handle = globals()["__ushort2bfloat16_rn"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3821,7 +3810,9 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - @lower(__ushort2bfloat16_rz, uint16) + handle = globals()["__ushort2bfloat16_rz"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3862,7 +3853,9 @@ def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - @lower(__ushort2bfloat16_rd, uint16) + handle = globals()["__ushort2bfloat16_rd"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3903,7 +3896,9 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - @lower(__ushort2bfloat16_ru, uint16) + handle = globals()["__ushort2bfloat16_ru"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3945,7 
+3940,9 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3987,7 +3984,9 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4031,7 +4030,9 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["make_bfloat162"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4077,7 +4078,9 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4119,7 +4122,9 @@ def 
_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4160,7 +4165,9 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - @lower(__ull2bfloat16_rn, uint64) + handle = globals()["__ull2bfloat16_rn"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) @@ -4199,7 +4206,9 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - @lower(__ull2bfloat16_rz, uint64) + handle = globals()["__ull2bfloat16_rz"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str) @@ -4238,7 +4247,9 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - @lower(__ull2bfloat16_rd, uint64) + handle = globals()["__ull2bfloat16_rd"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) @@ -4277,7 +4288,9 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - @lower(__ull2bfloat16_ru, 
uint64) + handle = globals()["__ull2bfloat16_ru"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) @@ -4317,7 +4330,9 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4359,7 +4374,9 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4401,7 +4418,9 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4443,7 +4462,9 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_ru"] + + @lower(handle, _type___nv_bfloat16) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4484,7 +4505,9 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - @lower(__ll2bfloat16_rn, int64) + handle = globals()["__ll2bfloat16_rn"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) @@ -4523,7 +4546,9 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - @lower(__ll2bfloat16_rz, int64) + handle = globals()["__ll2bfloat16_rz"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) @@ -4562,7 +4587,9 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - @lower(__ll2bfloat16_rd, int64) + handle = globals()["__ll2bfloat16_rd"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) @@ -4601,7 +4628,9 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - @lower(__ll2bfloat16_ru, int64) + handle = globals()["__ll2bfloat16_ru"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) @@ -4641,7 +4670,9 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - @lower(htrunc, _type___nv_bfloat16) + handle = globals()["htrunc"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4683,7 +4714,9 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - @lower(hceil, _type___nv_bfloat16) + handle = globals()["hceil"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4725,7 +4758,9 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - @lower(hfloor, _type___nv_bfloat16) + handle = globals()["hfloor"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4767,7 +4802,9 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - @lower(hrint, _type___nv_bfloat16) + handle = globals()["hrint"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4809,7 +4846,9 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - @lower(h2trunc, _type___nv_bfloat162) + handle = globals()["h2trunc"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4851,7 +4890,9 @@ 
def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - @lower(h2ceil, _type___nv_bfloat162) + handle = globals()["h2ceil"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4893,7 +4934,9 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - @lower(h2floor, _type___nv_bfloat162) + handle = globals()["h2floor"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4935,7 +4978,9 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - @lower(h2rint, _type___nv_bfloat162) + handle = globals()["h2rint"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4977,7 +5022,9 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162bfloat162, _type___nv_bfloat16) + handle = globals()["__bfloat162bfloat162"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5019,7 +5066,9 @@ def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - @lower(__lowhigh2highlow, _type___nv_bfloat162) 
+ handle = globals()["__lowhigh2highlow"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5063,7 +5112,9 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__lows2bfloat162"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5111,7 +5162,9 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__highs2bfloat162"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5157,7 +5210,9 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - @lower(__high2bfloat16, _type___nv_bfloat162) + handle = globals()["__high2bfloat16"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5199,7 +5254,9 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - @lower(__low2bfloat16, 
_type___nv_bfloat162) + handle = globals()["__low2bfloat16"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5240,7 +5297,9 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - @lower(__hisinf, _type___nv_bfloat16) + handle = globals()["__hisinf"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5284,7 +5343,9 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__halves2bfloat162"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5330,7 +5391,9 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - @lower(__low2bfloat162, _type___nv_bfloat162) + handle = globals()["__low2bfloat162"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5372,7 +5435,9 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - @lower(__high2bfloat162, _type___nv_bfloat162) + handle = globals()["__high2bfloat162"] + + @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5414,7 +5479,9 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat16_as_short, _type___nv_bfloat16) + handle = globals()["__bfloat16_as_short"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5456,7 +5523,9 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat16_as_ushort, _type___nv_bfloat16) + handle = globals()["__bfloat16_as_ushort"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5497,7 +5566,9 @@ def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - @lower(__short_as_bfloat16, int16) + handle = globals()["__short_as_bfloat16"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5538,7 +5609,9 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - @lower(__ushort_as_bfloat16, uint16) + handle = globals()["__ushort_as_bfloat16"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5589,7 +5662,9 @@ def 
_ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) + handle = globals()["__shfl_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5646,7 +5721,9 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) + handle = globals()["__shfl_up_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5703,7 +5780,9 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) + handle = globals()["__shfl_down_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5760,7 +5839,9 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) + handle = globals()["__shfl_xor_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5813,7 +5894,9 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) + handle = globals()["__shfl_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5866,7 +5949,9 @@ def 
_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) + handle = globals()["__shfl_up_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5919,7 +6004,9 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) + handle = globals()["__shfl_down_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5972,7 +6059,9 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) + handle = globals()["__shfl_xor_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6020,7 +6109,9 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldg, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldg"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6060,7 +6151,9 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldg, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldg"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6104,7 +6197,9 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcg, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcg"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6144,7 +6239,9 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcg, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcg"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6188,7 +6285,9 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldca, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldca"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6228,7 +6327,9 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldca, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldca"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6272,7 +6373,9 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): 
return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcs, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcs"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6312,7 +6415,9 @@ def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcs, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcs"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6356,7 +6461,9 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldlu, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldlu"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6396,7 +6503,9 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldlu, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldlu"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6440,7 +6549,9 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcv, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcv"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6480,7 +6591,9 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcv, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcv"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6527,7 +6640,9 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stwb"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6572,7 +6687,9 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stwb"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6621,7 +6738,9 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stcg"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6666,7 +6785,9 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stcg"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6715,7 +6836,9 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stcs"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6760,7 +6883,9 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stcs"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6809,7 +6934,9 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stwt"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6854,7 +6981,9 @@ def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stwt"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6902,7 +7031,9 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__heq2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6950,7 +7081,9 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hne2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6998,7 +7131,9 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hle2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7046,7 +7181,9 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hge2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7094,7 +7231,9 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hlt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7142,7 +7281,9 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7190,7 +7331,9 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hequ2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7238,7 +7381,9 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hneu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7286,7 +7431,9 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hleu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7334,7 +7481,9 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgeu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7382,7 +7531,9 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hltu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7430,7 +7581,9 
@@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgtu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7476,7 +7629,9 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__heq2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7522,7 +7677,9 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hne2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7568,7 +7725,9 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hle2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7614,7 +7773,9 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hge2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7660,7 +7821,9 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hlt2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7706,7 +7869,9 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgt2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7752,7 +7917,9 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hequ2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7798,7 +7965,9 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hneu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7844,7 +8013,9 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hleu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7890,7 +8061,9 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgeu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7936,7 +8109,9 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hltu2_mask"] + + @lower(handle, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7982,7 +8157,9 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgtu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8028,7 +8205,9 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - @lower(__hisnan2, _type___nv_bfloat162) + handle = globals()["__hisnan2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8072,7 +8251,9 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8120,7 +8301,9 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8168,7 +8351,9 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8216,7 +8401,9 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8264,7 +8451,9 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8312,7 +8501,9 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8360,7 +8551,9 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__h2div"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8406,7 +8599,9 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - @lower(__habs2, _type___nv_bfloat162) + handle = globals()["__habs2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8450,7 +8645,9 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8498,7 +8695,9 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8546,7 +8745,9 @@ def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8596,11 +8797,10 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2"] + @lower( - __hfma2, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -8652,11 +8852,10 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2_sat"] + @lower( - __hfma2_sat, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -8704,7 +8903,9 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - @lower(__hneg2, _type___nv_bfloat162) + handle = globals()["__hneg2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8746,7 +8947,9 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - @lower(__habs, _type___nv_bfloat16) + handle = globals()["__habs"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8790,7 +8993,9 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8838,7 +9043,9 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8886,7 +9093,9 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8934,7 +9143,9 @@ def 
_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8982,7 +9193,9 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9030,7 +9243,9 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9078,7 +9293,9 @@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hdiv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9126,7 +9343,9 @@ def 
_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9174,7 +9393,9 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9222,7 +9443,9 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9272,8 +9495,10 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma"] + @lower( - __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9325,11 +9550,10 @@ 
def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma_sat"] + @lower( - __hfma_sat, - _type___nv_bfloat16, - _type___nv_bfloat16, - _type___nv_bfloat16, + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9377,7 +9601,9 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - @lower(__hneg, _type___nv_bfloat16) + handle = globals()["__hneg"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9419,7 +9645,9 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbeq2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9465,7 +9693,9 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbne2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9511,7 +9741,9 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hble2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9557,7 +9789,9 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbge2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9603,7 +9837,9 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hblt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9649,7 +9885,9 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9695,7 +9933,9 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbequ2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9741,7 +9981,9 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbneu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9787,7 +10029,9 @@ def _lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbleu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9833,7 +10077,9 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgeu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9879,7 +10125,9 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbltu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9925,7 +10173,9 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgtu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9971,7 +10221,9 @@ def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__heq"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10017,7 +10269,9 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hne"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10063,7 +10317,9 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return 
_ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hle"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10109,7 +10365,9 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hge"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10155,7 +10413,9 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hlt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10201,7 +10461,9 @@ def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10247,7 +10509,9 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) 
+ handle = globals()["__hequ"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10293,7 +10557,9 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hneu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10339,7 +10605,9 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hleu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10385,7 +10653,9 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgeu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10431,7 +10701,9 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hltu"] + + @lower(handle, _type___nv_bfloat16, 
_type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10477,7 +10749,9 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgtu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10522,7 +10796,9 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - @lower(__hisnan, _type___nv_bfloat16) + handle = globals()["__hisnan"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10566,7 +10842,9 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmax"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10614,7 +10892,9 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmin"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -10662,7 +10942,9 @@ def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmax_nan"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10710,7 +10992,9 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmin_nan"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10760,11 +11044,10 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma_relu"] + @lower( - __hfma_relu, - _type___nv_bfloat16, - _type___nv_bfloat16, - _type___nv_bfloat16, + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -10814,7 +11097,9 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmax2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10862,7 +11147,9 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmin2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10910,7 +11197,9 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmax2_nan"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10958,7 +11247,9 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmin2_nan"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11008,11 +11299,10 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2_relu"] + @lower( - __hfma2_relu, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, 
_type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11064,11 +11354,10 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hcmadd"] + @lower( - __hcmadd, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11116,7 +11405,9 @@ def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - @lower(hsqrt, _type___nv_bfloat16) + handle = globals()["hsqrt"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11158,7 +11449,9 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - @lower(hrsqrt, _type___nv_bfloat16) + handle = globals()["hrsqrt"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11200,7 +11493,9 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - @lower(hrcp, _type___nv_bfloat16) + handle = globals()["hrcp"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) @@ -11240,7 +11535,9 @@ 
def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - @lower(hlog, _type___nv_bfloat16) + handle = globals()["hlog"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) @@ -11280,7 +11577,9 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - @lower(hlog2, _type___nv_bfloat16) + handle = globals()["hlog2"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11322,7 +11621,9 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - @lower(hlog10, _type___nv_bfloat16) + handle = globals()["hlog10"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11364,7 +11665,9 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - @lower(hexp, _type___nv_bfloat16) + handle = globals()["hexp"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) @@ -11404,7 +11707,9 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - @lower(htanh_approx, _type___nv_bfloat16) + handle = globals()["htanh_approx"] + + 
@lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11446,7 +11751,9 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - @lower(h2tanh_approx, _type___nv_bfloat162) + handle = globals()["h2tanh_approx"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11488,7 +11795,9 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - @lower(htanh, _type___nv_bfloat16) + handle = globals()["htanh"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11530,7 +11839,9 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - @lower(h2tanh, _type___nv_bfloat162) + handle = globals()["h2tanh"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11572,7 +11883,9 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - @lower(hexp2, _type___nv_bfloat16) + handle = globals()["hexp2"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11614,7 +11927,9 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): 
return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - @lower(hexp10, _type___nv_bfloat16) + handle = globals()["hexp10"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11656,7 +11971,9 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - @lower(hcos, _type___nv_bfloat16) + handle = globals()["hcos"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) @@ -11696,7 +12013,9 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - @lower(hsin, _type___nv_bfloat16) + handle = globals()["hsin"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) @@ -11736,7 +12055,9 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - @lower(h2sqrt, _type___nv_bfloat162) + handle = globals()["h2sqrt"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11778,7 +12099,9 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - @lower(h2rsqrt, _type___nv_bfloat162) + handle = globals()["h2rsqrt"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -11820,7 +12143,9 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - @lower(h2rcp, _type___nv_bfloat162) + handle = globals()["h2rcp"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11862,7 +12187,9 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - @lower(h2log, _type___nv_bfloat162) + handle = globals()["h2log"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11904,7 +12231,9 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - @lower(h2log2, _type___nv_bfloat162) + handle = globals()["h2log2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11946,7 +12275,9 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - @lower(h2log10, _type___nv_bfloat162) + handle = globals()["h2log10"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11988,7 +12319,9 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - @lower(h2exp, _type___nv_bfloat162) + handle = globals()["h2exp"] + + @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12030,7 +12363,9 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - @lower(h2exp2, _type___nv_bfloat162) + handle = globals()["h2exp2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12072,7 +12407,9 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - @lower(h2exp10, _type___nv_bfloat162) + handle = globals()["h2exp10"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12114,7 +12451,9 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - @lower(h2cos, _type___nv_bfloat162) + handle = globals()["h2cos"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12156,7 +12495,9 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - @lower(h2sin, _type___nv_bfloat162) + handle = globals()["h2sin"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12201,7 +12542,9 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["atomicAdd"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12246,7 +12589,9 @@ def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["atomicAdd"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12290,7 +12635,9 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.add"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12334,7 +12681,9 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.sub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12378,7 +12727,9 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return 
_ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.mul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12422,7 +12773,9 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.truediv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12466,7 +12819,9 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.iadd"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12510,7 +12865,9 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.isub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12554,7 +12911,9 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - 
@lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.imul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12598,7 +12957,9 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.itruediv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12640,7 +13001,9 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat16) + handle = globals()["operator.pos"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) @@ -12676,7 +13039,9 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat16) + handle = globals()["operator.neg"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) @@ -12712,7 +13077,9 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.eq"] + + 
@lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12754,7 +13121,9 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.ne"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12796,7 +13165,9 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.gt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12838,7 +13209,9 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.lt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12880,7 +13253,9 @@ def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.ge"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12922,7 +13297,9 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.le"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12966,7 +13343,9 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.add"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13010,7 +13389,9 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.sub"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13054,7 +13435,9 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.mul"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13098,7 +13481,9 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.truediv"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13142,7 +13527,9 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.iadd"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13186,7 +13573,9 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.isub"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13230,7 +13619,9 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.imul"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13274,7 +13665,9 @@ def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.itruediv"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13316,7 +13709,9 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat162) + handle = globals()["operator.pos"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) @@ -13352,7 +13747,9 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat162) + handle = globals()["operator.neg"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) @@ -13388,7 +13785,9 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.eq"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-13430,7 +13829,9 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.ne"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13472,7 +13873,9 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.gt"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13514,7 +13917,9 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.lt"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13556,7 +13961,9 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.ge"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13598,7 +14005,9 @@ def 
_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.le"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13643,7 +14052,9 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - @lower(__half, _type___nv_bfloat16) + handle = globals()["__half"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( From c647ae320486b998fb6ea724377db5a02ce19763 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 17/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 86ddba13b..398db5cbf 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -27,7 +27,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,8 +48,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -180,37 +181,7 @@ 
def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -356,6 +327,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. 
+ @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 4b262e999a3059e59ddf0f64bd42ca2550bce4f4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:24:02 -0700 Subject: [PATCH 18/56] generate the bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1104 +++++++++++++----- 1 file changed, 825 insertions(+), 279 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 398db5cbf..fb4e4de21 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -2,7 +2,7 @@ # Generator Information: # Ast_canopy version: 0.4.0 # Numbast version: 0.5.0 -# Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ +# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} # Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml # Cudatoolkit version: (12, 8) @@ -27,6 +27,7 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -48,10 +49,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = 
vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -181,7 +180,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -327,17 +356,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. 
- @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -1885,7 +1903,9 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - handle = globals()["__double2bfloat16"] + handle = globals().get("__double2bfloat16") + if handle is None: + handle = __double2bfloat16 @lower(handle, float64) def impl(context, builder, sig, args): @@ -1926,7 +1946,9 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - handle = globals()["__float2bfloat16"] + handle = globals().get("__float2bfloat16") + if handle is None: + handle = __float2bfloat16 @lower(handle, float32) def impl(context, builder, sig, args): @@ -1967,7 +1989,9 @@ def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - handle = globals()["__float2bfloat16_rn"] + handle = globals().get("__float2bfloat16_rn") + if handle is None: + handle = __float2bfloat16_rn @lower(handle, float32) def impl(context, builder, sig, args): @@ -2010,7 +2034,9 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - handle = globals()["__float2bfloat16_rz"] + handle = globals().get("__float2bfloat16_rz") + if handle is None: + handle = __float2bfloat16_rz @lower(handle, float32) def impl(context, builder, sig, args): @@ -2053,7 +2079,9 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - handle = 
globals()["__float2bfloat16_rd"] + handle = globals().get("__float2bfloat16_rd") + if handle is None: + handle = __float2bfloat16_rd @lower(handle, float32) def impl(context, builder, sig, args): @@ -2096,7 +2124,9 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - handle = globals()["__float2bfloat16_ru"] + handle = globals().get("__float2bfloat16_ru") + if handle is None: + handle = __float2bfloat16_ru @lower(handle, float32) def impl(context, builder, sig, args): @@ -2140,7 +2170,9 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162float"] + handle = globals().get("__bfloat162float") + if handle is None: + handle = __bfloat162float @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2184,7 +2216,9 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - handle = globals()["__float2bfloat162_rn"] + handle = globals().get("__float2bfloat162_rn") + if handle is None: + handle = __float2bfloat162_rn @lower(handle, float32) def impl(context, builder, sig, args): @@ -2228,7 +2262,9 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - handle = globals()["__floats2bfloat162_rn"] + handle = globals().get("__floats2bfloat162_rn") + if handle is None: + handle = __floats2bfloat162_rn @lower(handle, float32, float32) def impl(context, builder, sig, args): @@ -2274,7 +2310,9 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return 
_ZL11__low2float14__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2float"] + handle = globals().get("__low2float") + if handle is None: + handle = __low2float @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2318,7 +2356,9 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2float"] + handle = globals().get("__high2float") + if handle is None: + handle = __high2float @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2362,7 +2402,9 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - handle = globals()["__float22bfloat162_rn"] + handle = globals().get("__float22bfloat162_rn") + if handle is None: + handle = __float22bfloat162_rn @lower(handle, float32x2) def impl(context, builder, sig, args): @@ -2406,7 +2448,9 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - handle = globals()["__bfloat1622float2"] + handle = globals().get("__bfloat1622float2") + if handle is None: + handle = __bfloat1622float2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2450,7 +2494,9 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162char_rz"] + handle = globals().get("__bfloat162char_rz") + if handle is None: + handle = __bfloat162char_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2494,7 +2540,9 @@ def 
_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uchar_rz"] + handle = globals().get("__bfloat162uchar_rz") + if handle is None: + handle = __bfloat162uchar_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2538,7 +2586,9 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rn"] + handle = globals().get("__bfloat162int_rn") + if handle is None: + handle = __bfloat162int_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2582,7 +2632,9 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rz"] + handle = globals().get("__bfloat162int_rz") + if handle is None: + handle = __bfloat162int_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2626,7 +2678,9 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rd"] + handle = globals().get("__bfloat162int_rd") + if handle is None: + handle = __bfloat162int_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2670,7 +2724,9 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_ru"] + handle = 
globals().get("__bfloat162int_ru") + if handle is None: + handle = __bfloat162int_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2713,7 +2769,9 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - handle = globals()["__int2bfloat16_rn"] + handle = globals().get("__int2bfloat16_rn") + if handle is None: + handle = __int2bfloat16_rn @lower(handle, int32) def impl(context, builder, sig, args): @@ -2754,7 +2812,9 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - handle = globals()["__int2bfloat16_rz"] + handle = globals().get("__int2bfloat16_rz") + if handle is None: + handle = __int2bfloat16_rz @lower(handle, int32) def impl(context, builder, sig, args): @@ -2795,7 +2855,9 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - handle = globals()["__int2bfloat16_rd"] + handle = globals().get("__int2bfloat16_rd") + if handle is None: + handle = __int2bfloat16_rd @lower(handle, int32) def impl(context, builder, sig, args): @@ -2836,7 +2898,9 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - handle = globals()["__int2bfloat16_ru"] + handle = globals().get("__int2bfloat16_ru") + if handle is None: + handle = __int2bfloat16_ru @lower(handle, int32) def impl(context, builder, sig, args): @@ -2878,7 +2942,9 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rn"] + handle = globals().get("__bfloat162short_rn") + if handle is 
None: + handle = __bfloat162short_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2922,7 +2988,9 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rz"] + handle = globals().get("__bfloat162short_rz") + if handle is None: + handle = __bfloat162short_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2966,7 +3034,9 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rd"] + handle = globals().get("__bfloat162short_rd") + if handle is None: + handle = __bfloat162short_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3010,7 +3080,9 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_ru"] + handle = globals().get("__bfloat162short_ru") + if handle is None: + handle = __bfloat162short_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3053,7 +3125,9 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - handle = globals()["__short2bfloat16_rn"] + handle = globals().get("__short2bfloat16_rn") + if handle is None: + handle = __short2bfloat16_rn @lower(handle, int16) def impl(context, builder, sig, args): @@ -3096,7 +3170,9 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return 
_ZL19__short2bfloat16_rzs_nbst(arg_0) - handle = globals()["__short2bfloat16_rz"] + handle = globals().get("__short2bfloat16_rz") + if handle is None: + handle = __short2bfloat16_rz @lower(handle, int16) def impl(context, builder, sig, args): @@ -3139,7 +3215,9 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - handle = globals()["__short2bfloat16_rd"] + handle = globals().get("__short2bfloat16_rd") + if handle is None: + handle = __short2bfloat16_rd @lower(handle, int16) def impl(context, builder, sig, args): @@ -3182,7 +3260,9 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - handle = globals()["__short2bfloat16_ru"] + handle = globals().get("__short2bfloat16_ru") + if handle is None: + handle = __short2bfloat16_ru @lower(handle, int16) def impl(context, builder, sig, args): @@ -3226,7 +3306,9 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rn"] + handle = globals().get("__bfloat162uint_rn") + if handle is None: + handle = __bfloat162uint_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3270,7 +3352,9 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rz"] + handle = globals().get("__bfloat162uint_rz") + if handle is None: + handle = __bfloat162uint_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3314,7 +3398,9 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rd"] + handle = globals().get("__bfloat162uint_rd") + if handle is None: + handle = __bfloat162uint_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3358,7 +3444,9 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_ru"] + handle = globals().get("__bfloat162uint_ru") + if handle is None: + handle = __bfloat162uint_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3401,7 +3489,9 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rn"] + handle = globals().get("__uint2bfloat16_rn") + if handle is None: + handle = __uint2bfloat16_rn @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3444,7 +3534,9 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rz"] + handle = globals().get("__uint2bfloat16_rz") + if handle is None: + handle = __uint2bfloat16_rz @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3487,7 +3579,9 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rd"] + handle = globals().get("__uint2bfloat16_rd") + if handle is None: + handle = __uint2bfloat16_rd @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3530,7 +3624,9 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, 
shim_obj): def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - handle = globals()["__uint2bfloat16_ru"] + handle = globals().get("__uint2bfloat16_ru") + if handle is None: + handle = __uint2bfloat16_ru @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3574,7 +3670,9 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rn"] + handle = globals().get("__bfloat162ushort_rn") + if handle is None: + handle = __bfloat162ushort_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3618,7 +3716,9 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rz"] + handle = globals().get("__bfloat162ushort_rz") + if handle is None: + handle = __bfloat162ushort_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3662,7 +3762,9 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rd"] + handle = globals().get("__bfloat162ushort_rd") + if handle is None: + handle = __bfloat162ushort_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3706,7 +3808,9 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_ru"] + handle = globals().get("__bfloat162ushort_ru") + if handle is None: 
+ handle = __bfloat162ushort_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3749,7 +3853,9 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rn"] + handle = globals().get("__ushort2bfloat16_rn") + if handle is None: + handle = __ushort2bfloat16_rn @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3792,7 +3898,9 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rz"] + handle = globals().get("__ushort2bfloat16_rz") + if handle is None: + handle = __ushort2bfloat16_rz @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3835,7 +3943,9 @@ def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rd"] + handle = globals().get("__ushort2bfloat16_rd") + if handle is None: + handle = __ushort2bfloat16_rd @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3878,7 +3988,9 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - handle = globals()["__ushort2bfloat16_ru"] + handle = globals().get("__ushort2bfloat16_ru") + if handle is None: + handle = __ushort2bfloat16_ru @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3922,7 +4034,9 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rn"] + handle = globals().get("__bfloat162ull_rn") + if 
handle is None: + handle = __bfloat162ull_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3966,7 +4080,9 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rz"] + handle = globals().get("__bfloat162ull_rz") + if handle is None: + handle = __bfloat162ull_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4012,7 +4128,9 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["make_bfloat162"] + handle = globals().get("make_bfloat162") + if handle is None: + handle = make_bfloat162 @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4060,7 +4178,9 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rd"] + handle = globals().get("__bfloat162ull_rd") + if handle is None: + handle = __bfloat162ull_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4104,7 +4224,9 @@ def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_ru"] + handle = globals().get("__bfloat162ull_ru") + if handle is None: + handle = __bfloat162ull_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4147,7 +4269,9 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def 
_ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - handle = globals()["__ull2bfloat16_rn"] + handle = globals().get("__ull2bfloat16_rn") + if handle is None: + handle = __ull2bfloat16_rn @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4188,7 +4312,9 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - handle = globals()["__ull2bfloat16_rz"] + handle = globals().get("__ull2bfloat16_rz") + if handle is None: + handle = __ull2bfloat16_rz @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4229,7 +4355,9 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - handle = globals()["__ull2bfloat16_rd"] + handle = globals().get("__ull2bfloat16_rd") + if handle is None: + handle = __ull2bfloat16_rd @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4270,7 +4398,9 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - handle = globals()["__ull2bfloat16_ru"] + handle = globals().get("__ull2bfloat16_ru") + if handle is None: + handle = __ull2bfloat16_ru @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4312,7 +4442,9 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rn"] + handle = globals().get("__bfloat162ll_rn") + if handle is None: + handle = __bfloat162ll_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4356,7 +4488,9 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rz"] + handle = globals().get("__bfloat162ll_rz") + if handle is None: + handle = __bfloat162ll_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4400,7 +4534,9 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rd"] + handle = globals().get("__bfloat162ll_rd") + if handle is None: + handle = __bfloat162ll_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4444,7 +4580,9 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_ru"] + handle = globals().get("__bfloat162ll_ru") + if handle is None: + handle = __bfloat162ll_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4487,7 +4625,9 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rn"] + handle = globals().get("__ll2bfloat16_rn") + if handle is None: + handle = __ll2bfloat16_rn @lower(handle, int64) def impl(context, builder, sig, args): @@ -4528,7 +4668,9 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rz"] + handle = globals().get("__ll2bfloat16_rz") + if handle is None: + handle = __ll2bfloat16_rz @lower(handle, int64) def impl(context, builder, sig, args): @@ -4569,7 +4711,9 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): 
def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rd"] + handle = globals().get("__ll2bfloat16_rd") + if handle is None: + handle = __ll2bfloat16_rd @lower(handle, int64) def impl(context, builder, sig, args): @@ -4610,7 +4754,9 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - handle = globals()["__ll2bfloat16_ru"] + handle = globals().get("__ll2bfloat16_ru") + if handle is None: + handle = __ll2bfloat16_ru @lower(handle, int64) def impl(context, builder, sig, args): @@ -4652,7 +4798,9 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - handle = globals()["htrunc"] + handle = globals().get("htrunc") + if handle is None: + handle = htrunc @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4696,7 +4844,9 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - handle = globals()["hceil"] + handle = globals().get("hceil") + if handle is None: + handle = hceil @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4740,7 +4890,9 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - handle = globals()["hfloor"] + handle = globals().get("hfloor") + if handle is None: + handle = hfloor @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4784,7 +4936,9 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrint"] + handle = globals().get("hrint") + if handle is 
None: + handle = hrint @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4828,7 +4982,9 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2trunc"] + handle = globals().get("h2trunc") + if handle is None: + handle = h2trunc @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4872,7 +5028,9 @@ def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2ceil"] + handle = globals().get("h2ceil") + if handle is None: + handle = h2ceil @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4916,7 +5074,9 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2floor"] + handle = globals().get("h2floor") + if handle is None: + handle = h2floor @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4960,7 +5120,9 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rint"] + handle = globals().get("h2rint") + if handle is None: + handle = h2rint @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5004,7 +5166,9 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162bfloat162"] + handle = globals().get("__bfloat162bfloat162") + if handle is None: + handle = __bfloat162bfloat162 @lower(handle, _type___nv_bfloat16) def 
impl(context, builder, sig, args): @@ -5048,7 +5212,9 @@ def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - handle = globals()["__lowhigh2highlow"] + handle = globals().get("__lowhigh2highlow") + if handle is None: + handle = __lowhigh2highlow @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5094,7 +5260,9 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__lows2bfloat162"] + handle = globals().get("__lows2bfloat162") + if handle is None: + handle = __lows2bfloat162 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5144,7 +5312,9 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__highs2bfloat162"] + handle = globals().get("__highs2bfloat162") + if handle is None: + handle = __highs2bfloat162 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5192,7 +5362,9 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2bfloat16"] + handle = globals().get("__high2bfloat16") + if handle is None: + handle = __high2bfloat16 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5236,7 +5408,9 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2bfloat16"] + handle = globals().get("__low2bfloat16") + if handle is None: + handle = __low2bfloat16 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5279,7 +5453,9 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hisinf"] + handle = globals().get("__hisinf") + if handle is None: + handle = __hisinf @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5325,7 +5501,9 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__halves2bfloat162"] + handle = globals().get("__halves2bfloat162") + if handle is None: + handle = __halves2bfloat162 @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5373,7 +5551,9 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2bfloat162"] + handle = globals().get("__low2bfloat162") + if handle is None: + handle = __low2bfloat162 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5417,7 +5597,9 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2bfloat162"] + handle = globals().get("__high2bfloat162") + if handle is None: + handle = __high2bfloat162 @lower(handle, _type___nv_bfloat162) def 
impl(context, builder, sig, args): @@ -5461,7 +5643,9 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat16_as_short"] + handle = globals().get("__bfloat16_as_short") + if handle is None: + handle = __bfloat16_as_short @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5505,7 +5689,9 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat16_as_ushort"] + handle = globals().get("__bfloat16_as_ushort") + if handle is None: + handle = __bfloat16_as_ushort @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5548,7 +5734,9 @@ def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - handle = globals()["__short_as_bfloat16"] + handle = globals().get("__short_as_bfloat16") + if handle is None: + handle = __short_as_bfloat16 @lower(handle, int16) def impl(context, builder, sig, args): @@ -5591,7 +5779,9 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - handle = globals()["__ushort_as_bfloat16"] + handle = globals().get("__ushort_as_bfloat16") + if handle is None: + handle = __ushort_as_bfloat16 @lower(handle, uint16) def impl(context, builder, sig, args): @@ -5644,7 +5834,9 @@ def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_sync"] + handle = globals().get("__shfl_sync") + if handle is None: + handle = __shfl_sync @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def 
impl(context, builder, sig, args): @@ -5703,7 +5895,9 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_up_sync"] + handle = globals().get("__shfl_up_sync") + if handle is None: + handle = __shfl_up_sync @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): @@ -5762,7 +5956,9 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_down_sync"] + handle = globals().get("__shfl_down_sync") + if handle is None: + handle = __shfl_down_sync @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): @@ -5821,7 +6017,9 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_xor_sync"] + handle = globals().get("__shfl_xor_sync") + if handle is None: + handle = __shfl_xor_sync @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): @@ -5876,7 +6074,9 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_sync"] + handle = globals().get("__shfl_sync") + if handle is None: + handle = __shfl_sync @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): @@ -5931,7 +6131,9 @@ def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_up_sync"] + handle = globals().get("__shfl_up_sync") + if handle is None: + handle = __shfl_up_sync @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): @@ -5986,7 +6188,9 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_down_sync"] + handle = globals().get("__shfl_down_sync") + if handle is None: + handle = __shfl_down_sync @lower(handle, uint32, _type___nv_bfloat16, uint32, 
int32) def impl(context, builder, sig, args): @@ -6041,7 +6245,9 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_xor_sync"] + handle = globals().get("__shfl_xor_sync") + if handle is None: + handle = __shfl_xor_sync @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): @@ -6091,7 +6297,9 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldg"] + handle = globals().get("__ldg") + if handle is None: + handle = __ldg @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6133,7 +6341,9 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldg"] + handle = globals().get("__ldg") + if handle is None: + handle = __ldg @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6179,7 +6389,9 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcg"] + handle = globals().get("__ldcg") + if handle is None: + handle = __ldcg @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6221,7 +6433,9 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcg"] + handle = globals().get("__ldcg") + if handle is None: + handle = __ldcg @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6267,7 +6481,9 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldca"] + handle = globals().get("__ldca") + if handle is None: + handle = __ldca @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6309,7 +6525,9 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldca"] + handle = globals().get("__ldca") + if handle is None: + handle = __ldca @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6355,7 +6573,9 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcs"] + handle = globals().get("__ldcs") + if handle is None: + handle = __ldcs @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6397,7 +6617,9 @@ def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcs"] + handle = globals().get("__ldcs") + if handle is None: + handle = __ldcs @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6443,7 +6665,9 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldlu"] + handle = globals().get("__ldlu") + if handle is None: + handle = __ldlu @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6485,7 +6709,9 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldlu"] + handle = globals().get("__ldlu") + if handle is None: + handle = __ldlu @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6531,7 +6757,9 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcv"] + handle = globals().get("__ldcv") + if handle is None: + handle = __ldcv @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6573,7 +6801,9 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcv"] + handle = globals().get("__ldcv") + if handle is None: + handle = __ldcv @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6622,7 +6852,9 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stwb"] + handle = globals().get("__stwb") + if handle is None: + handle = __stwb @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6669,7 +6901,9 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stwb"] + handle = globals().get("__stwb") + if handle is None: + handle = __stwb @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6720,7 +6954,9 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stcg"] + handle = globals().get("__stcg") + if handle is None: + handle = __stcg @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6767,7 +7003,9 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stcg"] + handle = globals().get("__stcg") + if handle is None: + handle = __stcg @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6818,7 +7056,9 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stcs"] + handle = globals().get("__stcs") + if handle is None: + handle = __stcs @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6865,7 +7105,9 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stcs"] + handle = globals().get("__stcs") + if handle is None: + handle = __stcs @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6916,7 +7158,9 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stwt"] + handle = globals().get("__stwt") + if handle is None: + handle = __stwt @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6963,7 +7207,9 @@ def 
_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stwt"] + handle = globals().get("__stwt") + if handle is None: + handle = __stwt @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -7013,7 +7259,9 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__heq2"] + handle = globals().get("__heq2") + if handle is None: + handle = __heq2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7063,7 +7311,9 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hne2"] + handle = globals().get("__hne2") + if handle is None: + handle = __hne2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7113,7 +7363,9 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hle2"] + handle = globals().get("__hle2") + if handle is None: + handle = __hle2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7163,7 +7415,9 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hge2"] + handle = globals().get("__hge2") + if handle is None: + handle = __hge2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def 
impl(context, builder, sig, args): @@ -7213,7 +7467,9 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hlt2"] + handle = globals().get("__hlt2") + if handle is None: + handle = __hlt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7263,7 +7519,9 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgt2"] + handle = globals().get("__hgt2") + if handle is None: + handle = __hgt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7313,7 +7571,9 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hequ2"] + handle = globals().get("__hequ2") + if handle is None: + handle = __hequ2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7363,7 +7623,9 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hneu2"] + handle = globals().get("__hneu2") + if handle is None: + handle = __hneu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7413,7 +7675,9 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hleu2"] + handle = globals().get("__hleu2") + if handle is None: + handle = __hleu2 
@lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7463,7 +7727,9 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgeu2"] + handle = globals().get("__hgeu2") + if handle is None: + handle = __hgeu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7513,7 +7779,9 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hltu2"] + handle = globals().get("__hltu2") + if handle is None: + handle = __hltu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7563,7 +7831,9 @@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgtu2"] + handle = globals().get("__hgtu2") + if handle is None: + handle = __hgtu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7611,7 +7881,9 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__heq2_mask"] + handle = globals().get("__heq2_mask") + if handle is None: + handle = __heq2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7659,7 +7931,9 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hne2_mask"] + handle = globals().get("__hne2_mask") + if handle is None: + handle = __hne2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7707,7 +7981,9 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hle2_mask"] + handle = globals().get("__hle2_mask") + if handle is None: + handle = __hle2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7755,7 +8031,9 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hge2_mask"] + handle = globals().get("__hge2_mask") + if handle is None: + handle = __hge2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7803,7 +8081,9 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hlt2_mask"] + handle = globals().get("__hlt2_mask") + if handle is None: + handle = __hlt2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7851,7 +8131,9 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgt2_mask"] + handle = globals().get("__hgt2_mask") + if handle is None: + handle = __hgt2_mask @lower(handle, _type___nv_bfloat162, 
_type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7899,7 +8181,9 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hequ2_mask"] + handle = globals().get("__hequ2_mask") + if handle is None: + handle = __hequ2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7947,7 +8231,9 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hneu2_mask"] + handle = globals().get("__hneu2_mask") + if handle is None: + handle = __hneu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7995,7 +8281,9 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hleu2_mask"] + handle = globals().get("__hleu2_mask") + if handle is None: + handle = __hleu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8043,7 +8331,9 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgeu2_mask"] + handle = globals().get("__hgeu2_mask") + if handle is None: + handle = __hgeu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8091,7 +8381,9 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hltu2_mask"] + handle = globals().get("__hltu2_mask") + if handle is None: + handle = __hltu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8139,7 +8431,9 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgtu2_mask"] + handle = globals().get("__hgtu2_mask") + if handle is None: + handle = __hgtu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8187,7 +8481,9 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - handle = globals()["__hisnan2"] + handle = globals().get("__hisnan2") + if handle is None: + handle = __hisnan2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8233,7 +8529,9 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2"] + handle = globals().get("__hadd2") + if handle is None: + handle = __hadd2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8283,7 +8581,9 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2"] + handle = globals().get("__hsub2") + if handle is None: + handle = __hsub2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, 
sig, args): @@ -8333,7 +8633,9 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2"] + handle = globals().get("__hmul2") + if handle is None: + handle = __hmul2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8383,7 +8685,9 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2_rn"] + handle = globals().get("__hadd2_rn") + if handle is None: + handle = __hadd2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8433,7 +8737,9 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2_rn"] + handle = globals().get("__hsub2_rn") + if handle is None: + handle = __hsub2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8483,7 +8789,9 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2_rn"] + handle = globals().get("__hmul2_rn") + if handle is None: + handle = __hmul2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8533,7 +8841,9 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__h2div"] + handle = 
globals().get("__h2div") + if handle is None: + handle = __h2div @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8581,7 +8891,9 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - handle = globals()["__habs2"] + handle = globals().get("__habs2") + if handle is None: + handle = __habs2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8627,7 +8939,9 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2_sat"] + handle = globals().get("__hadd2_sat") + if handle is None: + handle = __hadd2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8677,7 +8991,9 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2_sat"] + handle = globals().get("__hsub2_sat") + if handle is None: + handle = __hsub2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8727,7 +9043,9 @@ def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2_sat"] + handle = globals().get("__hmul2_sat") + if handle is None: + handle = __hmul2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8779,7 +9097,9 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def 
_ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2"] + handle = globals().get("__hfma2") + if handle is None: + handle = __hfma2 @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -8834,7 +9154,9 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2_sat"] + handle = globals().get("__hfma2_sat") + if handle is None: + handle = __hfma2_sat @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -8885,7 +9207,9 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - handle = globals()["__hneg2"] + handle = globals().get("__hneg2") + if handle is None: + handle = __hneg2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8929,7 +9253,9 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - handle = globals()["__habs"] + handle = globals().get("__habs") + if handle is None: + handle = __habs @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -8975,7 +9301,9 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd"] + handle = globals().get("__hadd") + if handle is None: + handle = __hadd @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9025,7 +9353,9 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def 
_ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub"] + handle = globals().get("__hsub") + if handle is None: + handle = __hsub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9075,7 +9405,9 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul"] + handle = globals().get("__hmul") + if handle is None: + handle = __hmul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9125,7 +9457,9 @@ def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd_rn"] + handle = globals().get("__hadd_rn") + if handle is None: + handle = __hadd_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9175,7 +9509,9 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub_rn"] + handle = globals().get("__hsub_rn") + if handle is None: + handle = __hsub_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9225,7 +9561,9 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul_rn"] + handle = globals().get("__hmul_rn") + if handle is None: + handle = __hmul_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9275,7 +9613,9 
@@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hdiv"] + handle = globals().get("__hdiv") + if handle is None: + handle = __hdiv @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9325,7 +9665,9 @@ def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd_sat"] + handle = globals().get("__hadd_sat") + if handle is None: + handle = __hadd_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9375,7 +9717,9 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub_sat"] + handle = globals().get("__hsub_sat") + if handle is None: + handle = __hsub_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9425,7 +9769,9 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul_sat"] + handle = globals().get("__hmul_sat") + if handle is None: + handle = __hmul_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9477,7 +9823,9 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma"] + handle = globals().get("__hfma") + if handle is None: + handle = 
__hfma @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -9532,7 +9880,9 @@ def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma_sat"] + handle = globals().get("__hfma_sat") + if handle is None: + handle = __hfma_sat @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -9583,7 +9933,9 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hneg"] + handle = globals().get("__hneg") + if handle is None: + handle = __hneg @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9627,7 +9979,9 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbeq2"] + handle = globals().get("__hbeq2") + if handle is None: + handle = __hbeq2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9675,7 +10029,9 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbne2"] + handle = globals().get("__hbne2") + if handle is None: + handle = __hbne2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9723,7 +10079,9 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hble2"] + handle = globals().get("__hble2") + if 
handle is None: + handle = __hble2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9771,7 +10129,9 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbge2"] + handle = globals().get("__hbge2") + if handle is None: + handle = __hbge2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9819,7 +10179,9 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hblt2"] + handle = globals().get("__hblt2") + if handle is None: + handle = __hblt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9867,7 +10229,9 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgt2"] + handle = globals().get("__hbgt2") + if handle is None: + handle = __hbgt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9915,7 +10279,9 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbequ2"] + handle = globals().get("__hbequ2") + if handle is None: + handle = __hbequ2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9963,7 +10329,9 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbneu2"] + handle = globals().get("__hbneu2") + if handle is None: + handle = __hbneu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10011,7 +10379,9 @@ def _lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbleu2"] + handle = globals().get("__hbleu2") + if handle is None: + handle = __hbleu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10059,7 +10429,9 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgeu2"] + handle = globals().get("__hbgeu2") + if handle is None: + handle = __hbgeu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10107,7 +10479,9 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbltu2"] + handle = globals().get("__hbltu2") + if handle is None: + handle = __hbltu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10155,7 +10529,9 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgtu2"] + handle = globals().get("__hbgtu2") + if handle is None: + handle = __hbgtu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10203,7 +10579,9 @@ def 
_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__heq"] + handle = globals().get("__heq") + if handle is None: + handle = __heq @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10251,7 +10629,9 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hne"] + handle = globals().get("__hne") + if handle is None: + handle = __hne @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10299,7 +10679,9 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hle"] + handle = globals().get("__hle") + if handle is None: + handle = __hle @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10347,7 +10729,9 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hge"] + handle = globals().get("__hge") + if handle is None: + handle = __hge @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10395,7 +10779,9 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hlt"] + handle = globals().get("__hlt") + if handle is None: + handle = __hlt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10443,7 +10829,9 @@ def 
_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgt"] + handle = globals().get("__hgt") + if handle is None: + handle = __hgt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10491,7 +10879,9 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hequ"] + handle = globals().get("__hequ") + if handle is None: + handle = __hequ @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10539,7 +10929,9 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hneu"] + handle = globals().get("__hneu") + if handle is None: + handle = __hneu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10587,7 +10979,9 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hleu"] + handle = globals().get("__hleu") + if handle is None: + handle = __hleu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10635,7 +11029,9 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgeu"] + handle = globals().get("__hgeu") + if handle is None: + handle = __hgeu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ 
-10683,7 +11079,9 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hltu"] + handle = globals().get("__hltu") + if handle is None: + handle = __hltu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10731,7 +11129,9 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgtu"] + handle = globals().get("__hgtu") + if handle is None: + handle = __hgtu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10778,7 +11178,9 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hisnan"] + handle = globals().get("__hisnan") + if handle is None: + handle = __hisnan @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10824,7 +11226,9 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmax"] + handle = globals().get("__hmax") + if handle is None: + handle = __hmax @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10874,7 +11278,9 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmin"] + handle = globals().get("__hmin") + if handle is None: + handle = __hmin @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): 
@@ -10924,7 +11330,9 @@ def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmax_nan"] + handle = globals().get("__hmax_nan") + if handle is None: + handle = __hmax_nan @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10974,7 +11382,9 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmin_nan"] + handle = globals().get("__hmin_nan") + if handle is None: + handle = __hmin_nan @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11026,7 +11436,9 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma_relu"] + handle = globals().get("__hfma_relu") + if handle is None: + handle = __hfma_relu @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -11079,7 +11491,9 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmax2"] + handle = globals().get("__hmax2") + if handle is None: + handle = __hmax2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11129,7 +11543,9 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmin2"] + handle = 
globals().get("__hmin2") + if handle is None: + handle = __hmin2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11179,7 +11595,9 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmax2_nan"] + handle = globals().get("__hmax2_nan") + if handle is None: + handle = __hmax2_nan @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11229,7 +11647,9 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmin2_nan"] + handle = globals().get("__hmin2_nan") + if handle is None: + handle = __hmin2_nan @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11281,7 +11701,9 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2_relu"] + handle = globals().get("__hfma2_relu") + if handle is None: + handle = __hfma2_relu @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -11336,7 +11758,9 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hcmadd"] + handle = globals().get("__hcmadd") + if handle is None: + handle = __hcmadd @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -11387,7 +11811,9 @@ def 
_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals()["hsqrt"] + handle = globals().get("hsqrt") + if handle is None: + handle = hsqrt @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11431,7 +11857,9 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrsqrt"] + handle = globals().get("hrsqrt") + if handle is None: + handle = hrsqrt @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11475,7 +11903,9 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrcp"] + handle = globals().get("hrcp") + if handle is None: + handle = hrcp @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11517,7 +11947,9 @@ def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog"] + handle = globals().get("hlog") + if handle is None: + handle = hlog @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11559,7 +11991,9 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog2"] + handle = globals().get("hlog2") + if handle is None: + handle = hlog2 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11603,7 +12037,9 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog10"] + handle = 
globals().get("hlog10") + if handle is None: + handle = hlog10 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11647,7 +12083,9 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp"] + handle = globals().get("hexp") + if handle is None: + handle = hexp @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11689,7 +12127,9 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - handle = globals()["htanh_approx"] + handle = globals().get("htanh_approx") + if handle is None: + handle = htanh_approx @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11733,7 +12173,9 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2tanh_approx"] + handle = globals().get("h2tanh_approx") + if handle is None: + handle = h2tanh_approx @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11777,7 +12219,9 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - handle = globals()["htanh"] + handle = globals().get("htanh") + if handle is None: + handle = htanh @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11821,7 +12265,9 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2tanh"] + handle = globals().get("h2tanh") + if handle is None: + handle = h2tanh @lower(handle, _type___nv_bfloat162) 
def impl(context, builder, sig, args): @@ -11865,7 +12311,9 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp2"] + handle = globals().get("hexp2") + if handle is None: + handle = hexp2 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11909,7 +12357,9 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp10"] + handle = globals().get("hexp10") + if handle is None: + handle = hexp10 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11953,7 +12403,9 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - handle = globals()["hcos"] + handle = globals().get("hcos") + if handle is None: + handle = hcos @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11995,7 +12447,9 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - handle = globals()["hsin"] + handle = globals().get("hsin") + if handle is None: + handle = hsin @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12037,7 +12491,9 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2sqrt"] + handle = globals().get("h2sqrt") + if handle is None: + handle = h2sqrt @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12081,7 +12537,9 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return 
_ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rsqrt"] + handle = globals().get("h2rsqrt") + if handle is None: + handle = h2rsqrt @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12125,7 +12583,9 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rcp"] + handle = globals().get("h2rcp") + if handle is None: + handle = h2rcp @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12169,7 +12629,9 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log"] + handle = globals().get("h2log") + if handle is None: + handle = h2log @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12213,7 +12675,9 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log2"] + handle = globals().get("h2log2") + if handle is None: + handle = h2log2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12257,7 +12721,9 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log10"] + handle = globals().get("h2log10") + if handle is None: + handle = h2log10 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12301,7 +12767,9 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp"] + handle = globals().get("h2exp") + if handle is None: + handle = h2exp @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12345,7 +12813,9 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp2"] + handle = globals().get("h2exp2") + if handle is None: + handle = h2exp2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12389,7 +12859,9 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp10"] + handle = globals().get("h2exp10") + if handle is None: + handle = h2exp10 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12433,7 +12905,9 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2cos"] + handle = globals().get("h2cos") + if handle is None: + handle = h2cos @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12477,7 +12951,9 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2sin"] + handle = globals().get("h2sin") + if handle is None: + handle = h2sin @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12524,7 +13000,9 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["atomicAdd"] + handle = globals().get("atomicAdd") + if handle is None: + handle = atomicAdd @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12571,7 +13049,9 @@ 
def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["atomicAdd"] + handle = globals().get("atomicAdd") + if handle is None: + handle = atomicAdd @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12617,7 +13097,9 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.add"] + handle = globals().get("operator.add") + if handle is None: + handle = operator.add @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12663,7 +13145,9 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.sub"] + handle = globals().get("operator.sub") + if handle is None: + handle = operator.sub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12709,7 +13193,9 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.mul"] + handle = globals().get("operator.mul") + if handle is None: + handle = operator.mul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12755,7 +13241,9 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.truediv"] + handle = globals().get("operator.truediv") + if handle is None: + handle = operator.truediv 
@lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12801,7 +13289,9 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.iadd"] + handle = globals().get("operator.iadd") + if handle is None: + handle = operator.iadd @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12847,7 +13337,9 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.isub"] + handle = globals().get("operator.isub") + if handle is None: + handle = operator.isub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12893,7 +13385,9 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.imul"] + handle = globals().get("operator.imul") + if handle is None: + handle = operator.imul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12939,7 +13433,9 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.itruediv"] + handle = globals().get("operator.itruediv") + if handle is None: + handle = operator.itruediv @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12983,7 +13479,9 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - handle = globals()["operator.pos"] + handle 
= globals().get("operator.pos") + if handle is None: + handle = operator.pos @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13021,7 +13519,9 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - handle = globals()["operator.neg"] + handle = globals().get("operator.neg") + if handle is None: + handle = operator.neg @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13059,7 +13559,9 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.eq"] + handle = globals().get("operator.eq") + if handle is None: + handle = operator.eq @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13103,7 +13605,9 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.ne"] + handle = globals().get("operator.ne") + if handle is None: + handle = operator.ne @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13147,7 +13651,9 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.gt"] + handle = globals().get("operator.gt") + if handle is None: + handle = operator.gt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13191,7 +13697,9 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.lt"] + handle 
= globals().get("operator.lt") + if handle is None: + handle = operator.lt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13235,7 +13743,9 @@ def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.ge"] + handle = globals().get("operator.ge") + if handle is None: + handle = operator.ge @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13279,7 +13789,9 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.le"] + handle = globals().get("operator.le") + if handle is None: + handle = operator.le @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13325,7 +13837,9 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.add"] + handle = globals().get("operator.add") + if handle is None: + handle = operator.add @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13371,7 +13885,9 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.sub"] + handle = globals().get("operator.sub") + if handle is None: + handle = operator.sub @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13417,7 +13933,9 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return 
_ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.mul"] + handle = globals().get("operator.mul") + if handle is None: + handle = operator.mul @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13463,7 +13981,9 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.truediv"] + handle = globals().get("operator.truediv") + if handle is None: + handle = operator.truediv @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13509,7 +14029,9 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.iadd"] + handle = globals().get("operator.iadd") + if handle is None: + handle = operator.iadd @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13555,7 +14077,9 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.isub"] + handle = globals().get("operator.isub") + if handle is None: + handle = operator.isub @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13601,7 +14125,9 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.imul"] + handle = globals().get("operator.imul") + if handle is None: + handle = operator.imul @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13647,7 +14173,9 @@ def 
_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.itruediv"] + handle = globals().get("operator.itruediv") + if handle is None: + handle = operator.itruediv @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13691,7 +14219,9 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - handle = globals()["operator.pos"] + handle = globals().get("operator.pos") + if handle is None: + handle = operator.pos @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13729,7 +14259,9 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - handle = globals()["operator.neg"] + handle = globals().get("operator.neg") + if handle is None: + handle = operator.neg @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13767,7 +14299,9 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.eq"] + handle = globals().get("operator.eq") + if handle is None: + handle = operator.eq @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13811,7 +14345,9 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.ne"] + handle = globals().get("operator.ne") + if handle is None: + handle = operator.ne @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ 
-13855,7 +14391,9 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.gt"] + handle = globals().get("operator.gt") + if handle is None: + handle = operator.gt @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13899,7 +14437,9 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.lt"] + handle = globals().get("operator.lt") + if handle is None: + handle = operator.lt @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13943,7 +14483,9 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.ge"] + handle = globals().get("operator.ge") + if handle is None: + handle = operator.ge @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13987,7 +14529,9 @@ def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.le"] + handle = globals().get("operator.le") + if handle is None: + handle = operator.le @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -14034,7 +14578,9 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - handle = globals()["__half"] + handle = globals().get("__half") + if handle is None: + handle = __half @lower(handle, _type___nv_bfloat16) 
def impl(context, builder, sig, args): From 9e79d370a435e7b354423a9df068338606d22918 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 19/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index fb4e4de21..e27df978d 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -27,7 +27,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,8 +48,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -180,37 +181,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = 
ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -356,6 +327,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. + @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 667d9fae07a1a20e4f4f5f26f25f5ead70fc6385 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:32:50 -0700 Subject: [PATCH 20/56] generate bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 157 +++++++++++++++++-- 1 file changed, 142 insertions(+), 15 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index e27df978d..5adb821eb 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,6 +1,6 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: -# Ast_canopy version: 0.4.0 +# Ast_canopy version: 0.5.0 # Numbast version: 0.5.0 # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} @@ -23,10 +23,12 @@ register_model, ) from numba.core.imputils import Registry as TargetRegistry +from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, 
ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -48,10 +50,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -181,7 +181,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -281,6 +311,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) @@ -327,17 +366,6 @@ def ctor_impl(context, 
builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. - @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -382,6 +410,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) @@ -426,6 +463,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) @@ -470,6 +516,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) @@ -514,6 +569,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + 
_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) @@ -558,6 +622,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) @@ -602,6 +675,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) @@ -646,6 +728,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) @@ -690,6 +781,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) @@ -734,6 +834,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) @@ -778,6 
+887,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @@ -1792,6 +1910,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) + @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat162, fromty), + [value], + ) + _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) From c4cf6858b5b6c073ac58375ff3c33652c1adf94c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 21/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 5adb821eb..21babf0c9 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,7 +28,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -50,8 +49,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -181,37 +182,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, 
"y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -366,6 +337,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. 
+ @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 7a89d3ebfd620491da36323dc5f956e87364d411 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:38:13 -0700 Subject: [PATCH 22/56] generate the bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1648 ++++-------------- 1 file changed, 314 insertions(+), 1334 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 21babf0c9..0eef75e12 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,6 +28,7 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,10 +50,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -182,7 +181,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = 
type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -337,17 +366,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. - @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -1994,11 +2012,7 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - handle = globals().get("__double2bfloat16") - if handle is None: - handle = __double2bfloat16 - - @lower(handle, float64) + @lower(__double2bfloat16, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) @@ -2037,11 +2051,7 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - handle = globals().get("__float2bfloat16") - if handle is None: - handle = __float2bfloat16 - - @lower(handle, float32) + @lower(__float2bfloat16, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) @@ -2080,11 +2090,7 @@ def 
_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rn") - if handle is None: - handle = __float2bfloat16_rn - - @lower(handle, float32) + @lower(__float2bfloat16_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2125,11 +2131,7 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rz") - if handle is None: - handle = __float2bfloat16_rz - - @lower(handle, float32) + @lower(__float2bfloat16_rz, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2170,11 +2172,7 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rd") - if handle is None: - handle = __float2bfloat16_rd - - @lower(handle, float32) + @lower(__float2bfloat16_rd, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2215,11 +2213,7 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - handle = globals().get("__float2bfloat16_ru") - if handle is None: - handle = __float2bfloat16_ru - - @lower(handle, float32) + @lower(__float2bfloat16_ru, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2261,11 +2255,7 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162float") - if handle is None: - handle = __bfloat162float - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162float, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2307,11 +2297,7 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - handle = globals().get("__float2bfloat162_rn") - if handle is None: - handle = __float2bfloat162_rn - - @lower(handle, float32) + @lower(__float2bfloat162_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2353,11 +2339,7 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - handle = globals().get("__floats2bfloat162_rn") - if handle is None: - handle = __floats2bfloat162_rn - - @lower(handle, float32, float32) + @lower(__floats2bfloat162_rn, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2401,11 +2383,7 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2float") - if handle is None: - handle = __low2float - - @lower(handle, _type___nv_bfloat162) + @lower(__low2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2447,11 +2425,7 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, 
shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2float") - if handle is None: - handle = __high2float - - @lower(handle, _type___nv_bfloat162) + @lower(__high2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2493,11 +2467,7 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - handle = globals().get("__float22bfloat162_rn") - if handle is None: - handle = __float22bfloat162_rn - - @lower(handle, float32x2) + @lower(__float22bfloat162_rn, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2539,11 +2509,7 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__bfloat1622float2") - if handle is None: - handle = __bfloat1622float2 - - @lower(handle, _type___nv_bfloat162) + @lower(__bfloat1622float2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2585,11 +2551,7 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162char_rz") - if handle is None: - handle = __bfloat162char_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162char_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-2631,11 +2593,7 @@ def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uchar_rz") - if handle is None: - handle = __bfloat162uchar_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uchar_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2677,11 +2635,7 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rn") - if handle is None: - handle = __bfloat162int_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2723,11 +2677,7 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rz") - if handle is None: - handle = __bfloat162int_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2769,11 +2719,7 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rd") - if handle is None: - handle = __bfloat162int_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rd, _type___nv_bfloat16) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2815,11 +2761,7 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_ru") - if handle is None: - handle = __bfloat162int_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2860,11 +2802,7 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - handle = globals().get("__int2bfloat16_rn") - if handle is None: - handle = __int2bfloat16_rn - - @lower(handle, int32) + @lower(__int2bfloat16_rn, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) @@ -2903,11 +2841,7 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - handle = globals().get("__int2bfloat16_rz") - if handle is None: - handle = __int2bfloat16_rz - - @lower(handle, int32) + @lower(__int2bfloat16_rz, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) @@ -2946,11 +2880,7 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - handle = globals().get("__int2bfloat16_rd") - if handle is None: - handle = __int2bfloat16_rd - - @lower(handle, int32) + @lower(__int2bfloat16_rd, int32) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) @@ -2989,11 +2919,7 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - handle = globals().get("__int2bfloat16_ru") - if handle is None: - handle = __int2bfloat16_ru - - @lower(handle, int32) + @lower(__int2bfloat16_ru, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) @@ -3033,11 +2959,7 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_rn") - if handle is None: - handle = __bfloat162short_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3079,11 +3001,7 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_rz") - if handle is None: - handle = __bfloat162short_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3125,11 +3043,7 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - handle = 
globals().get("__bfloat162short_rd") - if handle is None: - handle = __bfloat162short_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3171,11 +3085,7 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_ru") - if handle is None: - handle = __bfloat162short_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3216,11 +3126,7 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - handle = globals().get("__short2bfloat16_rn") - if handle is None: - handle = __short2bfloat16_rn - - @lower(handle, int16) + @lower(__short2bfloat16_rn, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3261,11 +3167,7 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return _ZL19__short2bfloat16_rzs_nbst(arg_0) - handle = globals().get("__short2bfloat16_rz") - if handle is None: - handle = __short2bfloat16_rz - - @lower(handle, int16) + @lower(__short2bfloat16_rz, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3306,11 +3208,7 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - handle = 
globals().get("__short2bfloat16_rd") - if handle is None: - handle = __short2bfloat16_rd - - @lower(handle, int16) + @lower(__short2bfloat16_rd, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3351,11 +3249,7 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - handle = globals().get("__short2bfloat16_ru") - if handle is None: - handle = __short2bfloat16_ru - - @lower(handle, int16) + @lower(__short2bfloat16_ru, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3397,11 +3291,7 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rn") - if handle is None: - handle = __bfloat162uint_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3443,11 +3333,7 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rz") - if handle is None: - handle = __bfloat162uint_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3489,11 +3375,7 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rd") - if handle is None: - handle = __bfloat162uint_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3535,11 +3417,7 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_ru") - if handle is None: - handle = __bfloat162uint_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3580,11 +3458,7 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rn") - if handle is None: - handle = __uint2bfloat16_rn - - @lower(handle, uint32) + @lower(__uint2bfloat16_rn, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3625,11 +3499,7 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rz") - if handle is None: - handle = __uint2bfloat16_rz - - @lower(handle, uint32) + @lower(__uint2bfloat16_rz, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3670,11 +3540,7 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return 
_ZL18__uint2bfloat16_rdj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rd") - if handle is None: - handle = __uint2bfloat16_rd - - @lower(handle, uint32) + @lower(__uint2bfloat16_rd, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3715,11 +3581,7 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_ru") - if handle is None: - handle = __uint2bfloat16_ru - - @lower(handle, uint32) + @lower(__uint2bfloat16_ru, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3761,11 +3623,7 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rn") - if handle is None: - handle = __bfloat162ushort_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3807,11 +3665,7 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rz") - if handle is None: - handle = __bfloat162ushort_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3853,11 +3707,7 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rd") - if handle is None: - handle = __bfloat162ushort_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3899,11 +3749,7 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_ru") - if handle is None: - handle = __bfloat162ushort_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3944,11 +3790,7 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rn") - if handle is None: - handle = __ushort2bfloat16_rn - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rn, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3989,11 +3831,7 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rz") - if handle is None: - handle = __ushort2bfloat16_rz - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rz, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4034,11 +3872,7 @@ def 
_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rd") - if handle is None: - handle = __ushort2bfloat16_rd - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rd, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4079,11 +3913,7 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_ru") - if handle is None: - handle = __ushort2bfloat16_ru - - @lower(handle, uint16) + @lower(__ushort2bfloat16_ru, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4125,11 +3955,7 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rn") - if handle is None: - handle = __bfloat162ull_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4171,11 +3997,7 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rz") - if handle is None: - handle = __bfloat162ull_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4219,11 
+4041,7 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("make_bfloat162") - if handle is None: - handle = make_bfloat162 - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4269,11 +4087,7 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rd") - if handle is None: - handle = __bfloat162ull_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4315,11 +4129,7 @@ def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_ru") - if handle is None: - handle = __bfloat162ull_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4360,11 +4170,7 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rn") - if handle is None: - handle = __ull2bfloat16_rn - - @lower(handle, uint64) + @lower(__ull2bfloat16_rn, uint64) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) @@ -4403,11 +4209,7 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rz") - if handle is None: - handle = __ull2bfloat16_rz - - @lower(handle, uint64) + @lower(__ull2bfloat16_rz, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str) @@ -4446,11 +4248,7 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rd") - if handle is None: - handle = __ull2bfloat16_rd - - @lower(handle, uint64) + @lower(__ull2bfloat16_rd, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) @@ -4489,11 +4287,7 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_ru") - if handle is None: - handle = __ull2bfloat16_ru - - @lower(handle, uint64) + @lower(__ull2bfloat16_ru, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) @@ -4533,11 +4327,7 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rn") - if handle is None: - handle = __bfloat162ll_rn - - @lower(handle, _type___nv_bfloat16) + 
@lower(__bfloat162ll_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4579,11 +4369,7 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rz") - if handle is None: - handle = __bfloat162ll_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4625,11 +4411,7 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rd") - if handle is None: - handle = __bfloat162ll_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4671,11 +4453,7 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_ru") - if handle is None: - handle = __bfloat162ll_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4716,11 +4494,7 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rn") - if handle is None: - handle = __ll2bfloat16_rn 
- - @lower(handle, int64) + @lower(__ll2bfloat16_rn, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) @@ -4759,11 +4533,7 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rz") - if handle is None: - handle = __ll2bfloat16_rz - - @lower(handle, int64) + @lower(__ll2bfloat16_rz, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) @@ -4802,11 +4572,7 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rd") - if handle is None: - handle = __ll2bfloat16_rd - - @lower(handle, int64) + @lower(__ll2bfloat16_rd, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) @@ -4845,11 +4611,7 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - handle = globals().get("__ll2bfloat16_ru") - if handle is None: - handle = __ll2bfloat16_ru - - @lower(handle, int64) + @lower(__ll2bfloat16_ru, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) @@ -4889,11 +4651,7 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htrunc") - if handle is None: - handle = htrunc - - @lower(handle, 
_type___nv_bfloat16) + @lower(htrunc, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4935,11 +4693,7 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hceil") - if handle is None: - handle = hceil - - @lower(handle, _type___nv_bfloat16) + @lower(hceil, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4981,11 +4735,7 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hfloor") - if handle is None: - handle = hfloor - - @lower(handle, _type___nv_bfloat16) + @lower(hfloor, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5027,11 +4777,7 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrint") - if handle is None: - handle = hrint - - @lower(handle, _type___nv_bfloat16) + @lower(hrint, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5073,11 +4819,7 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2trunc") - if handle is None: - handle = h2trunc - - @lower(handle, _type___nv_bfloat162) + @lower(h2trunc, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -5119,11 +4861,7 @@ def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2ceil") - if handle is None: - handle = h2ceil - - @lower(handle, _type___nv_bfloat162) + @lower(h2ceil, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5165,11 +4903,7 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2floor") - if handle is None: - handle = h2floor - - @lower(handle, _type___nv_bfloat162) + @lower(h2floor, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5211,11 +4945,7 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rint") - if handle is None: - handle = h2rint - - @lower(handle, _type___nv_bfloat162) + @lower(h2rint, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5257,11 +4987,7 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162bfloat162") - if handle is None: - handle = __bfloat162bfloat162 - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162bfloat162, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5303,11 +5029,7 @@ def 
_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__lowhigh2highlow") - if handle is None: - handle = __lowhigh2highlow - - @lower(handle, _type___nv_bfloat162) + @lower(__lowhigh2highlow, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5351,11 +5073,7 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__lows2bfloat162") - if handle is None: - handle = __lows2bfloat162 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5403,11 +5121,7 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__highs2bfloat162") - if handle is None: - handle = __highs2bfloat162 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5453,11 +5167,7 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2bfloat16") - if handle is None: - handle = __high2bfloat16 
- - @lower(handle, _type___nv_bfloat162) + @lower(__high2bfloat16, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5499,11 +5209,7 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2bfloat16") - if handle is None: - handle = __low2bfloat16 - - @lower(handle, _type___nv_bfloat162) + @lower(__low2bfloat16, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5544,11 +5250,7 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hisinf") - if handle is None: - handle = __hisinf - - @lower(handle, _type___nv_bfloat16) + @lower(__hisinf, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5592,11 +5294,7 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__halves2bfloat162") - if handle is None: - handle = __halves2bfloat162 - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5642,11 +5340,7 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return 
_ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2bfloat162") - if handle is None: - handle = __low2bfloat162 - - @lower(handle, _type___nv_bfloat162) + @lower(__low2bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5688,11 +5382,7 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2bfloat162") - if handle is None: - handle = __high2bfloat162 - - @lower(handle, _type___nv_bfloat162) + @lower(__high2bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5734,11 +5424,7 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat16_as_short") - if handle is None: - handle = __bfloat16_as_short - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat16_as_short, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5780,11 +5466,7 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat16_as_ushort") - if handle is None: - handle = __bfloat16_as_ushort - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat16_as_ushort, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5825,11 +5507,7 @@ def 
_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - handle = globals().get("__short_as_bfloat16") - if handle is None: - handle = __short_as_bfloat16 - - @lower(handle, int16) + @lower(__short_as_bfloat16, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5870,11 +5548,7 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - handle = globals().get("__ushort_as_bfloat16") - if handle is None: - handle = __ushort_as_bfloat16 - - @lower(handle, uint16) + @lower(__ushort_as_bfloat16, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5925,11 +5599,7 @@ def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_sync") - if handle is None: - handle = __shfl_sync - - @lower(handle, uint32, _type___nv_bfloat162, int32, int32) + @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5986,11 +5656,7 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_up_sync") - if handle is None: - handle = __shfl_up_sync - - @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) + @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6047,11 +5713,7 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_down_sync") - if handle is None: - handle = 
__shfl_down_sync - - @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) + @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6108,11 +5770,7 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_xor_sync") - if handle is None: - handle = __shfl_xor_sync - - @lower(handle, uint32, _type___nv_bfloat162, int32, int32) + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6165,11 +5823,7 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_sync") - if handle is None: - handle = __shfl_sync - - @lower(handle, uint32, _type___nv_bfloat16, int32, int32) + @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6222,11 +5876,7 @@ def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_up_sync") - if handle is None: - handle = __shfl_up_sync - - @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) + @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6279,11 +5929,7 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_down_sync") - if handle is None: - handle = __shfl_down_sync - - @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) + @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6336,11 +5982,7 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_xor_sync") - if handle is None: - handle = __shfl_xor_sync - - @lower(handle, uint32, _type___nv_bfloat16, int32, int32) + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6388,11 +6030,7 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldg") - if handle is None: - handle = __ldg - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldg, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6432,11 +6070,7 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldg") - if handle is None: - handle = __ldg - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldg, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6480,11 +6114,7 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcg") - if handle is None: - handle = __ldcg - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcg, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6524,11 
+6154,7 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcg") - if handle is None: - handle = __ldcg - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcg, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6572,11 +6198,7 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldca") - if handle is None: - handle = __ldca - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldca, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6616,11 +6238,7 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldca") - if handle is None: - handle = __ldca - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldca, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6664,11 +6282,7 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcs") - if handle is None: - handle = __ldcs - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcs, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6708,11 +6322,7 @@ def 
_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcs") - if handle is None: - handle = __ldcs - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcs, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6756,11 +6366,7 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldlu") - if handle is None: - handle = __ldlu - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldlu, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6800,11 +6406,7 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldlu") - if handle is None: - handle = __ldlu - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldlu, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6848,11 +6450,7 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcv") - if handle is None: - handle = __ldcv - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcv, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6892,11 +6490,7 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, 
shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcv") - if handle is None: - handle = __ldcv - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcv, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6943,11 +6537,7 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stwb") - if handle is None: - handle = __stwb - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6992,11 +6582,7 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stwb") - if handle is None: - handle = __stwb - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7045,11 +6631,7 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stcg") - if handle is None: - handle = __stcg - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7094,11 +6676,7 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stcg") - if handle is None: - handle = __stcg - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7147,11 +6725,7 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stcs") - if handle is None: - handle = __stcs - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7196,11 +6770,7 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stcs") - if handle is None: - handle = __stcs - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7249,11 +6819,7 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stwt") - if handle is None: - handle = 
__stwt - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7298,11 +6864,7 @@ def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stwt") - if handle is None: - handle = __stwt - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7350,11 +6912,7 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__heq2") - if handle is None: - handle = __heq2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7402,11 +6960,7 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hne2") - if handle is None: - handle = __hne2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7454,11 +7008,7 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hle2") - if handle is None: - handle = __hle2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7506,11 +7056,7 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hge2") - if handle is None: - handle = __hge2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7558,11 +7104,7 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hlt2") - if handle is None: - handle = __hlt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7610,11 +7152,7 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgt2") - if handle is None: - handle = __hgt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7662,11 +7200,7 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hequ2") - if handle is None: - handle = __hequ2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7714,11 +7248,7 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hneu2") - if handle is None: - handle = __hneu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7766,11 +7296,7 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hleu2") - if handle is None: - handle = __hleu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7818,11 +7344,7 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu2") - if handle is None: - handle = __hgeu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgeu2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7870,11 +7392,7 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hltu2") - if handle is None: - handle = __hltu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7922,11 +7440,7 @@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu2") - if handle is None: - handle = __hgtu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7972,11 +7486,7 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__heq2_mask") - if handle is None: - handle = __heq2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8022,11 +7532,7 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hne2_mask") - if handle is None: - handle = __hne2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8072,11 +7578,7 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hle2_mask") - if handle is None: - handle = __hle2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8122,11 +7624,7 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hge2_mask") - if handle is None: - handle = __hge2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8172,11 +7670,7 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hlt2_mask") - if handle is None: - handle = __hlt2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8222,11 +7716,7 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgt2_mask") - if handle is None: - handle = __hgt2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8272,11 +7762,7 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hequ2_mask") - if handle is None: - handle = __hequ2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8322,11 +7808,7 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hneu2_mask") - if handle is None: - handle = __hneu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8372,11 +7854,7 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, 
arg_1) - handle = globals().get("__hleu2_mask") - if handle is None: - handle = __hleu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8422,11 +7900,7 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu2_mask") - if handle is None: - handle = __hgeu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8472,11 +7946,7 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hltu2_mask") - if handle is None: - handle = __hltu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8522,11 +7992,7 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu2_mask") - if handle is None: - handle = __hgtu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8572,11 +8038,7 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__hisnan2") - if handle is None: - handle = __hisnan2 - - @lower(handle, _type___nv_bfloat162) + @lower(__hisnan2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8620,11 +8082,7 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2") - if handle is None: - handle = __hadd2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8672,11 +8130,7 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2") - if handle is None: - handle = __hsub2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8724,11 +8178,7 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2") - if handle is None: - handle = __hmul2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8776,11 +8226,7 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2_rn") - if handle is None: - handle = __hadd2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8828,11 +8274,7 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2_rn") - if handle is None: - handle = __hsub2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8880,11 +8322,7 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2_rn") - if handle is None: - handle = __hmul2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8932,11 +8370,7 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__h2div") - if handle is None: - handle = __h2div - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8982,11 +8416,7 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__habs2") - if handle is None: - handle = __habs2 - - @lower(handle, _type___nv_bfloat162) + @lower(__habs2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9030,11 +8460,7 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2_sat") - if handle is None: - handle = __hadd2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9082,11 +8508,7 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2_sat") - if handle is None: - handle = __hsub2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9134,11 +8556,7 @@ def 
_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2_sat") - if handle is None: - handle = __hmul2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9188,12 +8606,11 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2") - if handle is None: - handle = __hfma2 - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9245,12 +8662,11 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2_sat") - if handle is None: - handle = __hfma2_sat - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2_sat, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9298,11 +8714,7 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__hneg2") - if handle is None: - handle = __hneg2 - - @lower(handle, 
_type___nv_bfloat162) + @lower(__hneg2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9344,11 +8756,7 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__habs") - if handle is None: - handle = __habs - - @lower(handle, _type___nv_bfloat16) + @lower(__habs, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9392,11 +8800,7 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd") - if handle is None: - handle = __hadd - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9444,11 +8848,7 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub") - if handle is None: - handle = __hsub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9496,11 +8896,7 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul") - if handle is None: - handle = __hmul - - @lower(handle, _type___nv_bfloat16, 
_type___nv_bfloat16) + @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9548,11 +8944,7 @@ def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd_rn") - if handle is None: - handle = __hadd_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9600,11 +8992,7 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub_rn") - if handle is None: - handle = __hsub_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9652,11 +9040,7 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul_rn") - if handle is None: - handle = __hmul_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9704,11 +9088,7 @@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return 
_ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hdiv") - if handle is None: - handle = __hdiv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9756,11 +9136,7 @@ def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd_sat") - if handle is None: - handle = __hadd_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9808,11 +9184,7 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub_sat") - if handle is None: - handle = __hsub_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9860,11 +9232,7 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul_sat") - if handle is None: - handle = __hmul_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -9914,12 +9282,8 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma") - if handle is None: - handle = __hfma - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9971,12 +9335,11 @@ def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma_sat") - if handle is None: - handle = __hfma_sat - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma_sat, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -10024,11 +9387,7 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hneg") - if handle is None: - handle = __hneg - - @lower(handle, _type___nv_bfloat16) + @lower(__hneg, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10070,11 +9429,7 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbeq2") - if handle is None: - handle = __hbeq2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbeq2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10120,11 +9475,7 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbne2") - if handle is None: - handle = __hbne2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10170,11 +9521,7 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hble2") - if handle is None: - handle = __hble2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10220,11 +9567,7 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbge2") - if handle is None: - handle = __hbge2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10270,11 +9613,7 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = 
globals().get("__hblt2") - if handle is None: - handle = __hblt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10320,11 +9659,7 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgt2") - if handle is None: - handle = __hbgt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10370,11 +9705,7 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbequ2") - if handle is None: - handle = __hbequ2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10420,11 +9751,7 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbneu2") - if handle is None: - handle = __hbneu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10470,11 +9797,7 @@ def 
_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbleu2") - if handle is None: - handle = __hbleu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10520,11 +9843,7 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgeu2") - if handle is None: - handle = __hbgeu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10570,11 +9889,7 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbltu2") - if handle is None: - handle = __hbltu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10620,11 +9935,7 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgtu2") - if handle is None: - handle = __hbgtu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgtu2, _type___nv_bfloat162, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10670,11 +9981,7 @@ def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__heq") - if handle is None: - handle = __heq - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10720,11 +10027,7 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hne") - if handle is None: - handle = __hne - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10770,11 +10073,7 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hle") - if handle is None: - handle = __hle - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10820,11 +10119,7 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hge") - if handle is None: - handle = __hge - - @lower(handle, 
_type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10870,11 +10165,7 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hlt") - if handle is None: - handle = __hlt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10920,11 +10211,7 @@ def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgt") - if handle is None: - handle = __hgt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10970,11 +10257,7 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hequ") - if handle is None: - handle = __hequ - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11020,11 +10303,7 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = 
globals().get("__hneu") - if handle is None: - handle = __hneu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11070,11 +10349,7 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hleu") - if handle is None: - handle = __hleu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11120,11 +10395,7 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu") - if handle is None: - handle = __hgeu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11170,11 +10441,7 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hltu") - if handle is None: - handle = __hltu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11220,11 +10487,7 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def 
_ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu") - if handle is None: - handle = __hgtu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11269,11 +10532,7 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hisnan") - if handle is None: - handle = __hisnan - - @lower(handle, _type___nv_bfloat16) + @lower(__hisnan, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11317,11 +10576,7 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmax") - if handle is None: - handle = __hmax - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11369,11 +10624,7 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmin") - if handle is None: - handle = __hmin - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11421,11 +10672,7 @@ def 
_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmax_nan") - if handle is None: - handle = __hmax_nan - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11473,11 +10720,7 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmin_nan") - if handle is None: - handle = __hmin_nan - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11527,12 +10770,11 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma_relu") - if handle is None: - handle = __hfma_relu - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma_relu, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11582,11 +10824,7 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmax2") - if handle is None: - handle = __hmax2 - - @lower(handle, _type___nv_bfloat162, 
_type___nv_bfloat162) + @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11634,11 +10872,7 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmin2") - if handle is None: - handle = __hmin2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11686,11 +10920,7 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmax2_nan") - if handle is None: - handle = __hmax2_nan - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11738,11 +10968,7 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmin2_nan") - if handle is None: - handle = __hmin2_nan - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11792,12 +11018,11 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def 
_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2_relu") - if handle is None: - handle = __hfma2_relu - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2_relu, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11849,12 +11074,11 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hcmadd") - if handle is None: - handle = __hcmadd - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hcmadd, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11902,11 +11126,7 @@ def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hsqrt") - if handle is None: - handle = hsqrt - - @lower(handle, _type___nv_bfloat16) + @lower(hsqrt, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11948,11 +11168,7 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrsqrt") - if handle is None: - handle = hrsqrt - - @lower(handle, _type___nv_bfloat16) + @lower(hrsqrt, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-11994,11 +11210,7 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrcp") - if handle is None: - handle = hrcp - - @lower(handle, _type___nv_bfloat16) + @lower(hrcp, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) @@ -12038,11 +11250,7 @@ def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog") - if handle is None: - handle = hlog - - @lower(handle, _type___nv_bfloat16) + @lower(hlog, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) @@ -12082,11 +11290,7 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog2") - if handle is None: - handle = hlog2 - - @lower(handle, _type___nv_bfloat16) + @lower(hlog2, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12128,11 +11332,7 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog10") - if handle is None: - handle = hlog10 - - @lower(handle, _type___nv_bfloat16) + @lower(hlog10, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12174,11 +11374,7 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp") - if handle is None: - handle = hexp - - @lower(handle, _type___nv_bfloat16) + @lower(hexp, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) @@ -12218,11 +11414,7 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htanh_approx") - if handle is None: - handle = htanh_approx - - @lower(handle, _type___nv_bfloat16) + @lower(htanh_approx, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12264,11 +11456,7 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2tanh_approx") - if handle is None: - handle = h2tanh_approx - - @lower(handle, _type___nv_bfloat162) + @lower(h2tanh_approx, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12310,11 +11498,7 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htanh") - if handle is None: - handle = htanh - - @lower(handle, _type___nv_bfloat16) + @lower(htanh, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12356,11 +11540,7 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2tanh") - if handle is None: - handle = h2tanh - - @lower(handle, _type___nv_bfloat162) + @lower(h2tanh, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12402,11 +11582,7 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp2") - if handle is None: - handle = hexp2 - - @lower(handle, _type___nv_bfloat16) + @lower(hexp2, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12448,11 +11624,7 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp10") - if handle is None: - handle = hexp10 - - @lower(handle, _type___nv_bfloat16) + @lower(hexp10, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12494,11 +11666,7 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hcos") - if handle is None: - handle = hcos - - @lower(handle, _type___nv_bfloat16) + @lower(hcos, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) @@ -12538,11 +11706,7 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hsin") - if 
handle is None: - handle = hsin - - @lower(handle, _type___nv_bfloat16) + @lower(hsin, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) @@ -12582,11 +11746,7 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2sqrt") - if handle is None: - handle = h2sqrt - - @lower(handle, _type___nv_bfloat162) + @lower(h2sqrt, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12628,11 +11788,7 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rsqrt") - if handle is None: - handle = h2rsqrt - - @lower(handle, _type___nv_bfloat162) + @lower(h2rsqrt, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12674,11 +11830,7 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rcp") - if handle is None: - handle = h2rcp - - @lower(handle, _type___nv_bfloat162) + @lower(h2rcp, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12720,11 +11872,7 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log") - if handle is None: - handle = h2log - - @lower(handle, _type___nv_bfloat162) + @lower(h2log, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12766,11 +11914,7 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log2") - if handle is None: - handle = h2log2 - - @lower(handle, _type___nv_bfloat162) + @lower(h2log2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12812,11 +11956,7 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log10") - if handle is None: - handle = h2log10 - - @lower(handle, _type___nv_bfloat162) + @lower(h2log10, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12858,11 +11998,7 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp") - if handle is None: - handle = h2exp - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12904,11 +12040,7 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp2") - if handle is None: - handle = h2exp2 - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -12950,11 +12082,7 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp10") - if handle is None: - handle = h2exp10 - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp10, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12996,11 +12124,7 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2cos") - if handle is None: - handle = h2cos - - @lower(handle, _type___nv_bfloat162) + @lower(h2cos, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13042,11 +12166,7 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2sin") - if handle is None: - handle = h2sin - - @lower(handle, _type___nv_bfloat162) + @lower(h2sin, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13091,11 +12211,7 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("atomicAdd") - if handle is None: - handle = atomicAdd - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13140,11 
+12256,7 @@ def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("atomicAdd") - if handle is None: - handle = atomicAdd - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13188,11 +12300,7 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.add") - if handle is None: - handle = operator.add - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13236,11 +12344,7 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.sub") - if handle is None: - handle = operator.sub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13284,11 +12388,7 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.mul") - if handle is None: - handle = operator.mul - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.mul, _type___nv_bfloat16, 
_type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13332,11 +12432,7 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.truediv") - if handle is None: - handle = operator.truediv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13380,11 +12476,7 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.iadd") - if handle is None: - handle = operator.iadd - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13428,11 +12520,7 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.isub") - if handle is None: - handle = operator.isub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13476,11 +12564,7 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.imul") - if 
handle is None: - handle = operator.imul - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13524,11 +12608,7 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.itruediv") - if handle is None: - handle = operator.itruediv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13570,11 +12650,7 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("operator.pos") - if handle is None: - handle = operator.pos - - @lower(handle, _type___nv_bfloat16) + @lower(operator.pos, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) @@ -13610,11 +12686,7 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("operator.neg") - if handle is None: - handle = operator.neg - - @lower(handle, _type___nv_bfloat16) + @lower(operator.neg, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) @@ -13650,11 +12722,7 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): 
return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.eq") - if handle is None: - handle = operator.eq - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13696,11 +12764,7 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ne") - if handle is None: - handle = operator.ne - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13742,11 +12806,7 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.gt") - if handle is None: - handle = operator.gt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13788,11 +12848,7 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.lt") - if handle is None: - handle = operator.lt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13834,11 +12890,7 @@ def 
_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ge") - if handle is None: - handle = operator.ge - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13880,11 +12932,7 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.le") - if handle is None: - handle = operator.le - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13928,11 +12976,7 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.add") - if handle is None: - handle = operator.add - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13976,11 +13020,7 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.sub") - if handle is None: - handle = operator.sub - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14024,11 +13064,7 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.mul") - if handle is None: - handle = operator.mul - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14072,11 +13108,7 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.truediv") - if handle is None: - handle = operator.truediv - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14120,11 +13152,7 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.iadd") - if handle is None: - handle = operator.iadd - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14168,11 +13196,7 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.isub") - if handle is None: - 
handle = operator.isub - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14216,11 +13240,7 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.imul") - if handle is None: - handle = operator.imul - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14264,11 +13284,7 @@ def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.itruediv") - if handle is None: - handle = operator.itruediv - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14310,11 +13326,7 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("operator.pos") - if handle is None: - handle = operator.pos - - @lower(handle, _type___nv_bfloat162) + @lower(operator.pos, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) @@ -14350,11 +13362,7 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("operator.neg") - if handle is None: - handle = operator.neg - - @lower(handle, _type___nv_bfloat162) + @lower(operator.neg, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) @@ -14390,11 +13398,7 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.eq") - if handle is None: - handle = operator.eq - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14436,11 +13440,7 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ne") - if handle is None: - handle = operator.ne - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14482,11 +13482,7 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.gt") - if handle is None: - handle = operator.gt - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -14528,11 +13524,7 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.lt") - if handle is None: - handle = operator.lt - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14574,11 +13566,7 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ge") - if handle is None: - handle = operator.ge - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14620,11 +13608,7 @@ def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.le") - if handle is None: - handle = operator.le - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14669,11 +13653,7 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__half") - if handle is None: - handle = __half - - @lower(handle, _type___nv_bfloat16) + @lower(__half, _type___nv_bfloat16) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( From da312aa0b74a9f67abf6317545b79d2c55b753d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 23/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 0eef75e12..a2af16d04 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,7 +28,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -50,8 +49,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -181,37 +182,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class 
_model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -366,6 +337,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. + @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From bc7dbaae3d1c785e41126562a52c3f1a1c642436 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:56:10 -0700 Subject: [PATCH 24/56] re-imports the bf16 intrinsics --- numba_cuda/numba/cuda/bf16.py | 63 +++++++++---------- .../numba/cuda/tests/cudapy/test_bfloat16.py | 60 +++++++++--------- 2 files changed, 60 insertions(+), 63 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 1ac3798c0..e29123bb2 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -3,22 +3,21 @@ target_registry, nv_bfloat16 as bfloat16, # Arithmetic intrinsics - __habs, - __hadd, - __hsub, - __hmul, - __hadd_rn, - __hsub_rn, - __hmul_rn, - __hdiv, - __hadd_sat, - __hsub_sat, - __hmul_sat, - __hfma, - __hfma_sat, - __hneg, - __hfma_relu, - atomicAdd, + __habs as habs, + __hadd as hadd, + __hsub as hsub, + __hmul as hmul, + __hadd_rn as hadd_rn, + __hsub_rn as hsub_rn, + __hmul_rn as hmul_rn, + __hdiv as hdiv, + __hadd_sat as hadd_sat, + __hsub_sat as hsub_sat, + __hmul_sat as hmul_sat, + __hfma as hfma, + __hfma_sat as hfma_sat, + __hneg as hneg, + __hfma_relu as hfma_relu, htrunc, hceil, hfloor, @@ -107,28 
+106,26 @@ def exp2_ol(a): except ImportError: pass - __all__ = [ "typing_registry", "target_registry", "bfloat16", # Arithmetic intrinsics - "__habs", - "__hadd", - "__hsub", - "__hmul", - "__hadd_rn", - "__hsub_rn", - "__hmul_rn", - "__hdiv", - "__hadd_sat", - "__hsub_sat", - "__hmul_sat", - "__hfma", - "__hfma_sat", - "__hneg", - "__hfma_relu", - "atomicAdd", + "habs", + "hadd", + "hsub", + "hmul", + "hadd_rn", + "hsub_rn", + "hmul_rn", + "hdiv", + "hadd_sat", + "hsub_sat", + "hmul_sat", + "hfma", + "hfma_sat", + "hneg", + "hfma_relu", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index af25a3860..b6210498a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,21 +1,21 @@ from numba import cuda, float32 from numba.cuda.bf16 import ( bfloat16, - __habs, - __hadd, - __hsub, - __hmul, - __hdiv, - __hadd_rn, - __hsub_rn, - __hmul_rn, - __hadd_sat, - __hsub_sat, - __hmul_sat, - __hfma, - __hfma_sat, - __hfma_relu, - __hneg, + habs, + hadd, + hsub, + hmul, + hadd_rn, + hsub_rn, + hmul_rn, + hdiv, + hadd_sat, + hsub_sat, + hmul_sat, + hfma, + hfma_sat, + hneg, + hfma_relu, ) from numba.cuda.testing import CUDATestCase @@ -86,17 +86,17 @@ def kernel(out): a = bfloat16(1.25) b = bfloat16(-2.5) - out[0] = float32(__habs(b)) - out[1] = float32(__hadd(a, b)) - out[2] = float32(__hsub(a, b)) - out[3] = float32(__hmul(a, b)) - out[4] = float32(__hdiv(b, a)) - out[5] = float32(__hneg(a)) - out[6] = float32(__hfma(a, b, b)) + out[0] = float32(habs(b)) + out[1] = float32(hadd(a, b)) + out[2] = float32(hsub(a, b)) + out[3] = float32(hmul(a, b)) + out[4] = float32(hdiv(b, a)) + out[5] = float32(hneg(a)) + out[6] = float32(hfma(a, b, b)) - out[7] = float32(__hadd_rn(a, b)) - out[8] = float32(__hsub_rn(a, b)) - out[9] = float32(__hmul_rn(a, b)) + out[7] = float32(hadd_rn(a, b)) + out[8] = float32(hsub_rn(a, 
b)) + out[9] = float32(hmul_rn(a, b)) out = cuda.device_array((10,), dtype="float32") kernel[1, 1](out) @@ -126,10 +126,10 @@ def kernel(out): a = bfloat16(1.5) b = bfloat16(0.75) - out[0] = float32(__hadd_sat(a, b)) # 2.25 -> 1.0 - out[1] = float32(__hsub_sat(b, a)) # -0.75 -> 0.0 - out[2] = float32(__hmul_sat(a, b)) # 1.125 -> 1.0 - out[3] = float32(__hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 + out[0] = float32(hadd_sat(a, b)) # 2.25 -> 1.0 + out[1] = float32(hsub_sat(b, a)) # -0.75 -> 0.0 + out[2] = float32(hmul_sat(a, b)) # 1.125 -> 1.0 + out[3] = float32(hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 out = cuda.device_array((4,), dtype="float32") kernel[1, 1](out) @@ -153,7 +153,7 @@ def kernel(out): b = bfloat16(2.0) c = bfloat16(0.0) - out[0] = float32(__hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 + out[0] = float32(hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 out = cuda.device_array((1,), dtype="float32") kernel[1, 1](out) From 04823e8acef71eedd286182805e6a398d68c8541 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:09:53 -0700 Subject: [PATCH 25/56] Add documentation for arithmetic operations --- docs/source/reference/types.rst | 86 +++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 774714c53..fc5e583e0 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -92,6 +92,7 @@ Construction of a single instance of a ``bfloat16`` object: - ``int32`` - ``uint64`` - ``uint32`` + - ``float16`` Conversely, ``bfloat16`` data can be cast back to existing native data type via ``dtype(b)``, where ``dtype`` is one of the data types above (except float16), @@ -100,7 +101,7 @@ and ``b`` is a bfloat16 object. 
Arithmetic ********** -Supported arithmetic operations on ``bfloat`16`` operands are: +Supported arithmetic operations on ``bfloat16`` operands are: - Arithmetic (``+``, ``-``, ``*``, ``/``) - Arithmetic assignment operators (``+=``, ``-=``, ``*=``, ``/=``) @@ -140,11 +141,11 @@ on ``bfloat16`` are provided: mode. .. function:: numba.cuda.bf16.hlog2(b) - Calculates bfloat16 decimal logarithm of input ``b`` in round-to-nearest-even - mode. + Calculates bfloat16 binary logarithm (base-2) of input ``b`` in + round-to-nearest-even mode. .. function:: numba.cuda.bf16.hlog10(b) - Calculates bfloat16 natural exponential function of input ``b`` in + Calculates bfloat16 common logarithm (base-10) of input ``b`` in round-to-nearest-even mode. .. function:: numba.cuda.bf16.hcos(b) @@ -187,3 +188,80 @@ on ``bfloat16`` are provided: .. function:: numba.cuda.bf16.hexp10(b) Calculates bfloat16 decimal exponential function of input ``b`` in round-to-nearest-even mode. + + +Arithmetic Intrinsics +********************* + +The following low-level arithmetic intrinsics are available under +``numba.cuda.bf16`` and map to CUDA bfloat16 arithmetic functions. Unless +otherwise noted, operations are performed in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.habs(a) + + Calculates the absolute value of input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hneg(a) + + Negates input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hadd(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hadd_rn(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. Prevents + contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hadd_sat(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. 
function:: numba.cuda.bf16.hsub(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hsub_rn(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hsub_sat(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hmul(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hmul_rn(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hmul_sat(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hdiv(a, b) + + Divides ``a`` by ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hfma(a, b, c) + + Computes a fused multiply-add of ``a`` and ``b`` plus ``c`` (bfloat16) in + round-to-nearest-even mode; i.e. returns ``a * b + c``. + +.. function:: numba.cuda.bf16.hfma_sat(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with saturation to the + range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hfma_relu(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with ReLU saturation; + i.e. returns ``max(0, a * b + c)``. 
From b7e0e8b89d572acb553e9e01a2ad1b9e43008c39 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:34:38 -0700 Subject: [PATCH 26/56] add logical intrinsics --- numba_cuda/numba/cuda/bf16.py | 39 +++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 107 ++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index e29123bb2..86cf7c510 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -18,6 +18,26 @@ __hfma_sat as hfma_sat, __hneg as hneg, __hfma_relu as hfma_relu, + # Comparison intrinsics + __heq as heq, + __hne as hne, + __hge as hge, + __hgt as hgt, + __hle as hle, + __hlt as hlt, + __hmax as hmax, + __hmin as hmin, + __hmax_nan as hmax_nan, + __hmin_nan as hmin_nan, + __hisinf as hisinf, + __hisnan as hisnan, + # Unordered comparison intrinsics + __hequ as hequ, + __hneu as hneu, + __hgeu as hgeu, + __hgtu as hgtu, + __hleu as hleu, + __hltu as hltu, htrunc, hceil, hfloor, @@ -126,6 +146,25 @@ def exp2_ol(a): "hfma_sat", "hneg", "hfma_relu", + # Comparison intrinsics + "heq", + "hne", + "hge", + "hgt", + "hle", + "hlt", + "hmax", + "hmin", + "hmax_nan", + "hmin_nan", + "hisinf", + "hisnan", + "hequ", + "hneu", + "hgeu", + "hgtu", + "hleu", + "hltu", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index b6210498a..3721c1506 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -16,6 +16,19 @@ hfma_sat, hneg, hfma_relu, + # Comparison intrinsics + heq, + hne, + hge, + hgt, + hle, + hlt, + hmax, + hmin, + hmax_nan, + hmin_nan, + hisnan, + hisinf, ) from numba.cuda.testing import CUDATestCase @@ -159,3 +172,97 @@ def kernel(out): kernel[1, 1](out) self.assertAlmostEqual(out[0], 0.0, delta=1e-3) + + def test_comparison_intrinsics(self): + self.skip_unsupported() + + def 
make_kernel(cmpfn): + @cuda.jit + def kernel(out, a, b): + a_bf16 = bfloat16(a) + b_bf16 = bfloat16(b) + out[0] = cmpfn(a_bf16, b_bf16) + + return kernel + + comparisons = [heq, hne, hge, hgt, hle, hlt] + ops = [ + lambda x, y: x == y, + lambda x, y: x != y, + lambda x, y: x >= y, + lambda x, y: x > y, + lambda x, y: x <= y, + lambda x, y: x < y, + ] + + for cmpfn, op in zip(comparisons, ops): + with self.subTest(cmpfn=cmpfn): + kernel = make_kernel(cmpfn) + out = cuda.device_array((1,), dtype="bool") + + a = 3.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 3.0)) + + a = 3.0 + b = 4.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 4.0)) + + a = 4.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(4.0, 3.0)) + + def test_hmax_hmin_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(3.0) + b = bfloat16(4.0) + out[0] = float32(hmax(a, b)) + out[1] = float32(hmin(a, b)) + + out = cuda.device_array((2,), dtype="float32") + kernel[1, 1](out) + self.assertAlmostEqual(out[0], 4.0, delta=1e-3) + self.assertAlmostEqual(out[1], 3.0, delta=1e-3) + + def test_nan_and_inf_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out_bool, out_int): + nanv = bfloat16(float("nan")) + infv = bfloat16(float("inf")) + out_bool[0] = hisnan(nanv) + out_int[0] = hisinf(infv) + + out_bool = cuda.device_array((1,), dtype="bool") + out_int = cuda.device_array((1,), dtype="int32") + kernel[1, 1](out_bool, out_int) + self.assertTrue(bool(out_bool[0])) + self.assertNotEqual(int(out_int[0]), 0) + + def test_hmax_nan_hmin_nan_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(float("nan")) + b = bfloat16(2.0) + out[0] = float32(hmax_nan(a, b)) + out[1] = float32(hmin_nan(a, b)) + out[2] = float32(hmax(a, b)) + out[3] = float32(hmin(a, b)) + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + # NaN-propagating 
variants should produce NaN + self.assertTrue(math.isnan(out[0])) + self.assertTrue(math.isnan(out[1])) + # Non-NaN variants should return the non-NaN operand + self.assertAlmostEqual(out[2], 2.0, delta=1e-3) + self.assertAlmostEqual(out[3], 2.0, delta=1e-3) From 3407e19bd078bc4a9ab943c13e125eaf95d55c91 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:36:02 -0700 Subject: [PATCH 27/56] make bfloat16 usable on host if ml_dtypes is installed --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 11 +++++++++++ numba_cuda/numba/cuda/types.py | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 3721c1506..7077ed122 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,3 +1,6 @@ +import unittest +from importlib.util import find_spec + from numba import cuda, float32 from numba.cuda.bf16 import ( bfloat16, @@ -266,3 +269,11 @@ def kernel(out): # Non-NaN variants should return the non-NaN operand self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + + @unittest.skipIf( + find_spec("ml_dtypes") is None, + "ml_dtypes is required to use bfloat16 on host", + ) + def test_use_bfloat16_on_host(self): + x = bfloat16(3.0) + self.assertEqual(x, 3.0) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 92e3cafde..5ddcaef5e 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -77,5 +77,13 @@ def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) + def cast_python_value(self, value): + try: + import ml_dtypes # noqa: F401 + + return ml_dtypes.bfloat16(value) + except ImportError: + raise NotImplementedError + bfloat16 = Bfloat16() From 2ce64ed10b1aa1602248f6706ec34c0ae221e871 Mon Sep 17 
00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:45:44 -0700 Subject: [PATCH 28/56] add comparison operators --- docs/source/reference/types.rst | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index fc5e583e0..b11d68186 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -265,3 +265,102 @@ otherwise noted, operations are performed in round-to-nearest-even mode. Fused multiply-add in round-to-nearest-even mode with ReLU saturation; i.e. returns ``max(0, a * b + c)``. + +Comparison Intrinsics +********************* + +Device-level comparison intrinsics operating on ``bfloat16`` values are +available under ``numba.cuda.bf16``. Unless stated otherwise, the ordered +comparisons return ``False`` if either input is NaN, following IEEE semantics. + +.. function:: numba.cuda.bf16.heq(a, b) + + Ordered equality. Returns ``True`` iff ``a == b``. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hne(a, b) + + Ordered inequality. Returns ``True`` iff ``a != b`` and neither input is NaN. + NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hge(a, b) + + Ordered greater-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hgt(a, b) + + Ordered greater-than. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hle(a, b) + + Ordered less-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hlt(a, b) + + Ordered less-than. NaN inputs yield ``False``. + +The unordered comparison variants return ``True`` when either input is NaN: + +.. function:: numba.cuda.bf16.hequ(a, b) + + Unordered equality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a == b``. + +.. function:: numba.cuda.bf16.hneu(a, b) + + Unordered inequality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a != b``. + +.. function:: numba.cuda.bf16.hgeu(a, b) + + Unordered greater-or-equal. 
Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a >= b``. + +.. function:: numba.cuda.bf16.hgtu(a, b) + + Unordered greater-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a > b``. + +.. function:: numba.cuda.bf16.hleu(a, b) + + Unordered less-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a <= b``. + +.. function:: numba.cuda.bf16.hltu(a, b) + + Unordered less-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a < b``. + +Min/Max operations follow CUDA semantics for zeros and NaNs: + +.. function:: numba.cuda.bf16.hmax(a, b) + + Returns ``max(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmin(a, b) + + Returns ``min(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmax_nan(a, b) + + Returns ``max(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +.. function:: numba.cuda.bf16.hmin_nan(a, b) + + Returns ``min(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +Special value predicates: + +.. function:: numba.cuda.bf16.hisnan(a) + + Returns ``True`` if ``a`` is a NaN, ``False`` otherwise. + +.. function:: numba.cuda.bf16.hisinf(a) + + Returns a nonzero integer if ``a`` is infinite, otherwise ``0``. + +.. note:: + + Python comparison operators on ``bfloat16`` values in device code map to + the ordered comparisons above. For more details on the CUDA bfloat16 + comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions + `_. 
From 7d289b22c382188fb2b1d3a9f38f3c22348a0f6f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:59:35 -0700 Subject: [PATCH 29/56] add basic conversion: float, int bidirectional conversion intrinsics --- numba_cuda/numba/cuda/bf16.py | 28 +++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 75 +++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 86cf7c510..ad893961d 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -38,6 +38,20 @@ __hgtu as hgtu, __hleu as hleu, __hltu as hltu, + # Precision conversion and data movement + __bfloat162float as bfloat162float, + __float2bfloat16_rn as float2bfloat16_rn, + __float2bfloat16_rz as float2bfloat16_rz, + __float2bfloat16_rd as float2bfloat16_rd, + __float2bfloat16_ru as float2bfloat16_ru, + __int2bfloat16_rn as int2bfloat16_rn, + __int2bfloat16_rz as int2bfloat16_rz, + __int2bfloat16_rd as int2bfloat16_rd, + __int2bfloat16_ru as int2bfloat16_ru, + __bfloat162int_rn as bfloat162int_rn, + __bfloat162int_rz as bfloat162int_rz, + __bfloat162int_rd as bfloat162int_rd, + __bfloat162int_ru as bfloat162int_ru, htrunc, hceil, hfloor, @@ -165,6 +179,20 @@ def exp2_ol(a): "hgtu", "hleu", "hltu", + # Precision conversion and data movement + "bfloat162float", + "float2bfloat16_rn", + "float2bfloat16_rz", + "float2bfloat16_rd", + "float2bfloat16_ru", + "int2bfloat16_rn", + "int2bfloat16_rz", + "int2bfloat16_rd", + "int2bfloat16_ru", + "bfloat162int_rn", + "bfloat162int_rz", + "bfloat162int_rd", + "bfloat162int_ru", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 7077ed122..53309f671 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -32,6 +32,20 @@ hmin_nan, hisnan, hisinf, + # Conversion intrinsics + bfloat162float, + 
float2bfloat16_rn, + float2bfloat16_rz, + float2bfloat16_rd, + float2bfloat16_ru, + int2bfloat16_rn, + int2bfloat16_rz, + int2bfloat16_rd, + int2bfloat16_ru, + bfloat162int_rn, + bfloat162int_rz, + bfloat162int_rd, + bfloat162int_ru, ) from numba.cuda.testing import CUDATestCase @@ -270,6 +284,67 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + def test_precision_conversion_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel_float_to_bf16(out): + f = float32(3.14) + out[0] = float32(float2bfloat16_rn(f)) + out[1] = float32(float2bfloat16_rz(f)) + out[2] = float32(float2bfloat16_rd(f)) + out[3] = float32(float2bfloat16_ru(f)) + + @cuda.jit + def kernel_bf16_to_float(out): + a = bfloat16(3.14) + out[0] = bfloat162float(a) + + @cuda.jit + def kernel_int_to_bf16(out): + i = 3 + out[0] = float32(int2bfloat16_rn(i)) + out[1] = float32(int2bfloat16_rz(i)) + out[2] = float32(int2bfloat16_rd(i)) + out[3] = float32(int2bfloat16_ru(i)) + + @cuda.jit + def kernel_bf16_to_int(out): + a = bfloat16(3.14) + out[0] = bfloat162int_rn(a) + out[1] = bfloat162int_rz(a) + out[2] = bfloat162int_rd(a) + out[3] = bfloat162int_ru(a) + + out = cuda.device_array((4,), dtype="float32") + kernel_float_to_bf16[1, 1](out) + # Check they are near the original value in float32 after round-trip + # Note: Different rounding modes produce slightly different values + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) # rn + self.assertTrue(abs(out[1] - 3.140625) < 2e-2, out[1] - 3.140625) # rz + self.assertTrue(abs(out[2] - 3.140625) < 2e-2, out[2] - 3.140625) # rd + self.assertTrue(abs(out[3] - 3.140625) < 2e-2, out[3] - 3.140625) # ru + + out = cuda.device_array((1,), dtype="float32") + kernel_bf16_to_float[1, 1](out) + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) + + outi = cuda.device_array((4,), dtype="int32") + kernel_int_to_bf16[1, 1](outi) + # int to bf16 should be exactly representable for small 
integers + self.assertEqual(int(outi[0]), 3) + self.assertEqual(int(outi[1]), 3) + self.assertEqual(int(outi[2]), 3) + self.assertEqual(int(outi[3]), 3) + + outi = cuda.device_array((4,), dtype="int32") + kernel_bf16_to_int[1, 1](outi) + # 3.14 -> 3 for rz/rd, 3 or 4 for rn/ru depending on rounding + self.assertIn(int(outi[0]), (3, 4)) + self.assertEqual(int(outi[1]), 3) + self.assertEqual(int(outi[2]), 3) + self.assertIn(int(outi[3]), (3, 4)) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", From 9317e7a03f37e09af61f88ce16e3918d80e310b2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:43:41 -0700 Subject: [PATCH 30/56] add numerical precision cast and tests --- numba_cuda/numba/cuda/bf16.py | 208 ++++++++++++++++-- .../numba/cuda/tests/cudapy/test_bfloat16.py | 160 +++++++++++--- 2 files changed, 327 insertions(+), 41 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index ad893961d..96c1c6f78 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -39,11 +39,18 @@ __hleu as hleu, __hltu as hltu, # Precision conversion and data movement + # - floating-point family __bfloat162float as bfloat162float, + __float2bfloat16 as float2bfloat16, + __double2bfloat16 as double2bfloat16, __float2bfloat16_rn as float2bfloat16_rn, __float2bfloat16_rz as float2bfloat16_rz, __float2bfloat16_rd as float2bfloat16_rd, __float2bfloat16_ru as float2bfloat16_ru, + # - char family + __bfloat162char_rz as bfloat162char_rz, + __bfloat162uchar_rz as bfloat162uchar_rz, + # - int family (signed 32-bit) __int2bfloat16_rn as int2bfloat16_rn, __int2bfloat16_rz as int2bfloat16_rz, __int2bfloat16_rd as int2bfloat16_rd, @@ -52,6 +59,56 @@ __bfloat162int_rz as bfloat162int_rz, __bfloat162int_rd as bfloat162int_rd, __bfloat162int_ru as bfloat162int_ru, + # - short family (signed 16-bit) + __short2bfloat16_rn as short2bfloat16_rn, + __short2bfloat16_rz as 
short2bfloat16_rz, + __short2bfloat16_rd as short2bfloat16_rd, + __short2bfloat16_ru as short2bfloat16_ru, + __bfloat162short_rn as bfloat162short_rn, + __bfloat162short_rz as bfloat162short_rz, + __bfloat162short_rd as bfloat162short_rd, + __bfloat162short_ru as bfloat162short_ru, + # - ushort family (unsigned 16-bit) + __ushort2bfloat16_rn as ushort2bfloat16_rn, + __ushort2bfloat16_rz as ushort2bfloat16_rz, + __ushort2bfloat16_rd as ushort2bfloat16_rd, + __ushort2bfloat16_ru as ushort2bfloat16_ru, + __bfloat162ushort_rn as bfloat162ushort_rn, + __bfloat162ushort_rz as bfloat162ushort_rz, + __bfloat162ushort_rd as bfloat162ushort_rd, + __bfloat162ushort_ru as bfloat162ushort_ru, + # - uint family (unsigned 32-bit) + __uint2bfloat16_rn as uint2bfloat16_rn, + __uint2bfloat16_rz as uint2bfloat16_rz, + __uint2bfloat16_rd as uint2bfloat16_rd, + __uint2bfloat16_ru as uint2bfloat16_ru, + __bfloat162uint_rn as bfloat162uint_rn, + __bfloat162uint_rz as bfloat162uint_rz, + __bfloat162uint_rd as bfloat162uint_rd, + __bfloat162uint_ru as bfloat162uint_ru, + # - ll family (signed 64-bit) + __ll2bfloat16_rn as ll2bfloat16_rn, + __ll2bfloat16_rz as ll2bfloat16_rz, + __ll2bfloat16_rd as ll2bfloat16_rd, + __ll2bfloat16_ru as ll2bfloat16_ru, + __bfloat162ll_rn as bfloat162ll_rn, + __bfloat162ll_rz as bfloat162ll_rz, + __bfloat162ll_rd as bfloat162ll_rd, + __bfloat162ll_ru as bfloat162ll_ru, + # - ull family (unsigned 64-bit) + __ull2bfloat16_rn as ull2bfloat16_rn, + __ull2bfloat16_rz as ull2bfloat16_rz, + __ull2bfloat16_rd as ull2bfloat16_rd, + __ull2bfloat16_ru as ull2bfloat16_ru, + __bfloat162ull_rn as bfloat162ull_rn, + __bfloat162ull_rz as bfloat162ull_rz, + __bfloat162ull_rd as bfloat162ull_rd, + __bfloat162ull_ru as bfloat162ull_ru, + # - bit reinterpret casts + __bfloat16_as_short as bfloat16_as_short, + __bfloat16_as_ushort as bfloat16_as_ushort, + __short_as_bfloat16 as short_as_bfloat16, + __ushort_as_bfloat16 as ushort_as_bfloat16, htrunc, hceil, hfloor, @@ -140,6 
+197,83 @@ def exp2_ol(a): except ImportError: pass +## Public aliases using Numba/Numpy-style type names +# Floating-point +float32_to_bfloat16 = float2bfloat16 +float64_to_bfloat16 = double2bfloat16 +bfloat16_to_float32 = bfloat162float +float32_to_bfloat16_rn = float2bfloat16_rn +float32_to_bfloat16_rz = float2bfloat16_rz +float32_to_bfloat16_rd = float2bfloat16_rd +float32_to_bfloat16_ru = float2bfloat16_ru + +# Char (8-bit) +bfloat16_to_int8_rz = bfloat162char_rz +bfloat16_to_uint8_rz = bfloat162uchar_rz + +# Int16 / UInt16 +int16_to_bfloat16_rn = short2bfloat16_rn +int16_to_bfloat16_rz = short2bfloat16_rz +int16_to_bfloat16_rd = short2bfloat16_rd +int16_to_bfloat16_ru = short2bfloat16_ru +bfloat16_to_int16_rn = bfloat162short_rn +bfloat16_to_int16_rz = bfloat162short_rz +bfloat16_to_int16_rd = bfloat162short_rd +bfloat16_to_int16_ru = bfloat162short_ru + +uint16_to_bfloat16_rn = ushort2bfloat16_rn +uint16_to_bfloat16_rz = ushort2bfloat16_rz +uint16_to_bfloat16_rd = ushort2bfloat16_rd +uint16_to_bfloat16_ru = ushort2bfloat16_ru +bfloat16_to_uint16_rn = bfloat162ushort_rn +bfloat16_to_uint16_rz = bfloat162ushort_rz +bfloat16_to_uint16_rd = bfloat162ushort_rd +bfloat16_to_uint16_ru = bfloat162ushort_ru + +# Int32 / UInt32 +int32_to_bfloat16_rn = int2bfloat16_rn +int32_to_bfloat16_rz = int2bfloat16_rz +int32_to_bfloat16_rd = int2bfloat16_rd +int32_to_bfloat16_ru = int2bfloat16_ru +bfloat16_to_int32_rn = bfloat162int_rn +bfloat16_to_int32_rz = bfloat162int_rz +bfloat16_to_int32_rd = bfloat162int_rd +bfloat16_to_int32_ru = bfloat162int_ru + +uint32_to_bfloat16_rn = uint2bfloat16_rn +uint32_to_bfloat16_rz = uint2bfloat16_rz +uint32_to_bfloat16_rd = uint2bfloat16_rd +uint32_to_bfloat16_ru = uint2bfloat16_ru +bfloat16_to_uint32_rn = bfloat162uint_rn +bfloat16_to_uint32_rz = bfloat162uint_rz +bfloat16_to_uint32_rd = bfloat162uint_rd +bfloat16_to_uint32_ru = bfloat162uint_ru + +# Int64 / UInt64 +int64_to_bfloat16_rn = ll2bfloat16_rn +int64_to_bfloat16_rz = 
ll2bfloat16_rz +int64_to_bfloat16_rd = ll2bfloat16_rd +int64_to_bfloat16_ru = ll2bfloat16_ru +bfloat16_to_int64_rn = bfloat162ll_rn +bfloat16_to_int64_rz = bfloat162ll_rz +bfloat16_to_int64_rd = bfloat162ll_rd +bfloat16_to_int64_ru = bfloat162ll_ru + +uint64_to_bfloat16_rn = ull2bfloat16_rn +uint64_to_bfloat16_rz = ull2bfloat16_rz +uint64_to_bfloat16_rd = ull2bfloat16_rd +uint64_to_bfloat16_ru = ull2bfloat16_ru +bfloat16_to_uint64_rn = bfloat162ull_rn +bfloat16_to_uint64_rz = bfloat162ull_rz +bfloat16_to_uint64_rd = bfloat162ull_rd +bfloat16_to_uint64_ru = bfloat162ull_ru + +# Bit reinterpret casts +bfloat16_as_int16 = bfloat16_as_short +bfloat16_as_uint16 = bfloat16_as_ushort +int16_as_bfloat16 = short_as_bfloat16 +uint16_as_bfloat16 = ushort_as_bfloat16 + __all__ = [ "typing_registry", "target_registry", @@ -180,19 +314,67 @@ def exp2_ol(a): "hleu", "hltu", # Precision conversion and data movement - "bfloat162float", - "float2bfloat16_rn", - "float2bfloat16_rz", - "float2bfloat16_rd", - "float2bfloat16_ru", - "int2bfloat16_rn", - "int2bfloat16_rz", - "int2bfloat16_rd", - "int2bfloat16_ru", - "bfloat162int_rn", - "bfloat162int_rz", - "bfloat162int_rd", - "bfloat162int_ru", + "float32_to_bfloat16", + "float64_to_bfloat16", + "bfloat16_to_float32", + "float32_to_bfloat16_rn", + "float32_to_bfloat16_rz", + "float32_to_bfloat16_rd", + "float32_to_bfloat16_ru", + "bfloat16_to_int8_rz", + "bfloat16_to_uint8_rz", + "int16_to_bfloat16_rn", + "int16_to_bfloat16_rz", + "int16_to_bfloat16_rd", + "int16_to_bfloat16_ru", + "bfloat16_to_int16_rn", + "bfloat16_to_int16_rz", + "bfloat16_to_int16_rd", + "bfloat16_to_int16_ru", + "uint16_to_bfloat16_rn", + "uint16_to_bfloat16_rz", + "uint16_to_bfloat16_rd", + "uint16_to_bfloat16_ru", + "bfloat16_to_uint16_rn", + "bfloat16_to_uint16_rz", + "bfloat16_to_uint16_rd", + "bfloat16_to_uint16_ru", + "int32_to_bfloat16_rn", + "int32_to_bfloat16_rz", + "int32_to_bfloat16_rd", + "int32_to_bfloat16_ru", + "bfloat16_to_int32_rn", + 
"bfloat16_to_int32_rz", + "bfloat16_to_int32_rd", + "bfloat16_to_int32_ru", + "uint32_to_bfloat16_rn", + "uint32_to_bfloat16_rz", + "uint32_to_bfloat16_rd", + "uint32_to_bfloat16_ru", + "bfloat16_to_uint32_rn", + "bfloat16_to_uint32_rz", + "bfloat16_to_uint32_rd", + "bfloat16_to_uint32_ru", + "int64_to_bfloat16_rn", + "int64_to_bfloat16_rz", + "int64_to_bfloat16_rd", + "int64_to_bfloat16_ru", + "bfloat16_to_int64_rn", + "bfloat16_to_int64_rz", + "bfloat16_to_int64_rd", + "bfloat16_to_int64_ru", + "uint64_to_bfloat16_rn", + "uint64_to_bfloat16_rz", + "uint64_to_bfloat16_rd", + "uint64_to_bfloat16_ru", + "bfloat16_to_uint64_rn", + "bfloat16_to_uint64_rz", + "bfloat16_to_uint64_rd", + "bfloat16_to_uint64_ru", + "bfloat16_as_int16", + "bfloat16_as_uint16", + "int16_as_bfloat16", + "uint16_as_bfloat16", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 53309f671..1147bba11 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -32,20 +32,38 @@ hmin_nan, hisnan, hisinf, - # Conversion intrinsics - bfloat162float, - float2bfloat16_rn, - float2bfloat16_rz, - float2bfloat16_rd, - float2bfloat16_ru, - int2bfloat16_rn, - int2bfloat16_rz, - int2bfloat16_rd, - int2bfloat16_ru, - bfloat162int_rn, - bfloat162int_rz, - bfloat162int_rd, - bfloat162int_ru, + # Conversion intrinsics (NumPy-style names) + bfloat16_to_float32, + float32_to_bfloat16, + float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, + int32_to_bfloat16_rn, + int32_to_bfloat16_rz, + int32_to_bfloat16_rd, + int32_to_bfloat16_ru, + bfloat16_to_int32_rn, + bfloat16_to_int32_rz, + bfloat16_to_int32_rd, + bfloat16_to_int32_ru, + bfloat16_to_int16_rn, + int16_to_bfloat16_rn, + bfloat16_to_uint16_rn, + uint16_to_bfloat16_rn, + bfloat16_to_uint32_rn, + uint32_to_bfloat16_rn, + 
bfloat16_to_int64_rn, + int64_to_bfloat16_rn, + bfloat16_to_uint64_rn, + uint64_to_bfloat16_rn, + bfloat16_as_short, + bfloat16_as_ushort, + short_as_bfloat16, + ushort_as_bfloat16, + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, ) from numba.cuda.testing import CUDATestCase @@ -284,37 +302,37 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_precision_conversion_intrinsics(self): + def test_int32_float32_precision_conversion_intrinsics(self): self.skip_unsupported() @cuda.jit def kernel_float_to_bf16(out): f = float32(3.14) - out[0] = float32(float2bfloat16_rn(f)) - out[1] = float32(float2bfloat16_rz(f)) - out[2] = float32(float2bfloat16_rd(f)) - out[3] = float32(float2bfloat16_ru(f)) + out[0] = float32(float32_to_bfloat16_rn(f)) + out[1] = float32(float32_to_bfloat16_rz(f)) + out[2] = float32(float32_to_bfloat16_rd(f)) + out[3] = float32(float32_to_bfloat16_ru(f)) @cuda.jit def kernel_bf16_to_float(out): a = bfloat16(3.14) - out[0] = bfloat162float(a) + out[0] = bfloat16_to_float32(a) @cuda.jit def kernel_int_to_bf16(out): i = 3 - out[0] = float32(int2bfloat16_rn(i)) - out[1] = float32(int2bfloat16_rz(i)) - out[2] = float32(int2bfloat16_rd(i)) - out[3] = float32(int2bfloat16_ru(i)) + out[0] = float32(int32_to_bfloat16_rn(i)) + out[1] = float32(int32_to_bfloat16_rz(i)) + out[2] = float32(int32_to_bfloat16_rd(i)) + out[3] = float32(int32_to_bfloat16_ru(i)) @cuda.jit def kernel_bf16_to_int(out): a = bfloat16(3.14) - out[0] = bfloat162int_rn(a) - out[1] = bfloat162int_rz(a) - out[2] = bfloat162int_rd(a) - out[3] = bfloat162int_ru(a) + out[0] = bfloat16_to_int32_rn(a) + out[1] = bfloat16_to_int32_rz(a) + out[2] = bfloat16_to_int32_rd(a) + out[3] = bfloat16_to_int32_ru(a) out = cuda.device_array((4,), dtype="float32") kernel_float_to_bf16[1, 1](out) @@ -345,6 +363,92 @@ def kernel_bf16_to_int(out): self.assertEqual(int(outi[2]), 3) self.assertIn(int(outi[3]), (3, 4)) + def 
test_floatroundtrip_integer_conversion_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel_scalar_roundtrip(out): + f = 3.14 + bf = float32_to_bfloat16(f) + out[0] = bfloat16_to_float32(bf) + d = 3.14 + bf2 = float64_to_bfloat16(d) + out[1] = bfloat16_to_float32(bf2) + + out = cuda.device_array((2,), dtype="float32") + kernel_scalar_roundtrip[1, 1](out) + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) + self.assertAlmostEqual(out[1], 3.140625, delta=1e-3) + + @cuda.jit + def kernel_int_family(outf): + outf[0] = float32(int16_to_bfloat16_rn(123)) + outf[1] = float32(uint16_to_bfloat16_rn(456)) + outf[2] = float32(uint32_to_bfloat16_rn(789)) + outf[3] = float32(int64_to_bfloat16_rn(1011)) + outf[4] = float32(uint64_to_bfloat16_rn(1213)) + + outf = cuda.device_array((5,), dtype="float32") + kernel_int_family[1, 1](outf) + vals = [123, 456, 789, 1011, 1213] + for i, v in enumerate(vals): + got = int(outf[i]) + # `step` estimates ULP near the integer `v`. + # Bfloat16 has 7 bits of precision, spacing between representable values is 2**(e-7). + # We use the exponent of the value `v` to raise the minSpacing, the result is a reasonable + # estimate of the local ULP. + step = ( + 0 if v == 0 else 2 ** (int(math.floor(math.log2(abs(v)))) - 7) + ) + # `allowed` is the maximum error in ULP, with a minimum of 1 + # In general, half ULP is the typical rounding error bound.
+ allowed = max(1, int(step // 2)) + self.assertLessEqual(abs(got - v), allowed) + + @cuda.jit + def kernel_from_bf16_to_ints(outi): + a = bfloat16(5.75) + outi[0] = bfloat16_to_int16_rn(a) + outi[1] = bfloat16_to_uint16_rn(a) + outi[2] = bfloat16_to_uint32_rn(a) + outi[3] = bfloat16_to_int64_rn(a) + outi[4] = bfloat16_to_uint64_rn(a) + + outi = cuda.device_array((5,), dtype="int64") + kernel_from_bf16_to_ints[1, 1](outi) + self.assertEqual(int(outi[0]), 6) + self.assertEqual(int(outi[1]), 6) + self.assertEqual(int(outi[2]), 6) + self.assertEqual(int(outi[3]), 6) + self.assertEqual(int(outi[4]), 6) + + @cuda.jit + def kernel_bit_reinterpret(out_short, out_ushort): + s = 12345 + bf = short_as_bfloat16(s) + out_short[0] = bfloat16_as_short(bf) + us = 54321 + bf2 = ushort_as_bfloat16(us) + out_ushort[0] = bfloat16_as_ushort(bf2) + + out_short = cuda.device_array((1,), dtype="int32") + out_ushort = cuda.device_array((1,), dtype="uint32") + kernel_bit_reinterpret[1, 1](out_short, out_ushort) + self.assertEqual(int(out_short[0]), 12345) + self.assertEqual(int(out_ushort[0]), 54321) + + @cuda.jit + def kernel_char(out_c, out_uc): + a = bfloat16(3.9) + out_c[0] = bfloat16_to_int8_rz(a) + out_uc[0] = bfloat16_to_uint8_rz(a) + + out_c = cuda.device_array((1,), dtype="int8") + out_uc = cuda.device_array((1,), dtype="uint8") + kernel_char[1, 1](out_c, out_uc) + self.assertEqual(int(out_c[0]), 3) + self.assertEqual(int(out_uc[0]), 3) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", From 55d222024a929bf4c4b942dec87a877224a453df Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:51:39 -0700 Subject: [PATCH 31/56] add documentation for conversions --- docs/source/reference/types.rst | 173 ++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index b11d68186..40112210a 100644 --- a/docs/source/reference/types.rst +++ 
b/docs/source/reference/types.rst @@ -364,3 +364,176 @@ Special value predicates: the ordered comparisons above. For more details on the CUDA bfloat16 comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions `_. + +Precision Conversion and Data Movement +************************************** + +The following conversion intrinsics convert between ``bfloat16`` and other +scalar types. Rounding-mode suffixes: + +- ``_rn``: round-to-nearest-even +- ``_rz``: round-towards-zero +- ``_rd``: round-down (towards −∞) +- ``_ru``: round-up (towards +∞) + +Floating-point conversions +========================== + +.. function:: numba.cuda.bf16.float32_to_bfloat16(x) + + Convert a ``float32`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.float64_to_bfloat16(x) + + Convert a ``float64`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.bfloat16_to_float32(x) + + Convert a ``bfloat16`` to ``float32``. + +.. function:: numba.cuda.bf16.float32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_ru(x) + + Convert a ``float32`` to ``bfloat16`` using the specified rounding mode. + +Integer conversions +=================== + +Representative APIs for each integer width are listed below. All have +rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``. + +int16 (signed 16-bit) +--------------------- + +.. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_ru(x) + + Convert an ``int16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_rz(x) +..
function:: numba.cuda.bf16.bfloat16_to_int16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_ru(x) + + Convert a ``bfloat16`` to ``int16`` with the selected rounding mode. + +uint16 (unsigned 16-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_ru(x) + + Convert a ``uint16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_ru(x) + + Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode. + +int32 (signed 32-bit) +--------------------- + +.. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_ru(x) + + Convert an ``int32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_ru(x) + + Convert a ``bfloat16`` to ``int32`` with the selected rounding mode. + +uint32 (unsigned 32-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_ru(x) + + Convert a ``uint32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rd(x) +.. 
function:: numba.cuda.bf16.bfloat16_to_uint32_ru(x) + + Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode. + +int64 (signed 64-bit) +--------------------- + +.. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_ru(x) + + Convert an ``int64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_ru(x) + + Convert a ``bfloat16`` to ``int64`` with the selected rounding mode. + +uint64 (unsigned 64-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_ru(x) + + Convert a ``uint64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_ru(x) + + Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode. + +8-bit conversions +================= + +.. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x) + + Convert a ``bfloat16`` to ``int8`` with round-towards-zero. + +.. function:: numba.cuda.bf16.bfloat16_to_uint8_rz(x) + + Convert a ``bfloat16`` to ``uint8`` with round-towards-zero. + +Bit Reinterpret Casts +********************* + +These APIs reinterpret bits without numeric conversion: + +.. function:: numba.cuda.bf16.bfloat16_as_int16(x) + + Reinterpret the bits of ``bfloat16`` as an ``int16``. + +.. 
function:: numba.cuda.bf16.bfloat16_as_uint16(x) + + Reinterpret the bits of ``bfloat16`` as a ``uint16``. + +.. function:: numba.cuda.bf16.int16_as_bfloat16(x) + + Reinterpret the bits of an ``int16`` as a ``bfloat16``. + +.. function:: numba.cuda.bf16.uint16_as_bfloat16(x) + + Reinterpret the bits of a ``uint16`` as a ``bfloat16``. From 702b8cab9eb2bac265a00d384ac750ca1676c315 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:54:57 -0700 Subject: [PATCH 32/56] removing cuda_bf16 vended headers --- numba_cuda/numba/cuda/include/11/cuda_bf16.h | 3749 ----------------- .../numba/cuda/include/11/cuda_bf16.hpp | 2683 ------------ 2 files changed, 6432 deletions(-) delete mode 100644 numba_cuda/numba/cuda/include/11/cuda_bf16.h delete mode 100644 numba_cuda/numba/cuda/include/11/cuda_bf16.hpp diff --git a/numba_cuda/numba/cuda/include/11/cuda_bf16.h b/numba_cuda/numba/cuda/include/11/cuda_bf16.h deleted file mode 100644 index 78f660d38..000000000 --- a/numba_cuda/numba/cuda/include/11/cuda_bf16.h +++ /dev/null @@ -1,3749 +0,0 @@ -/* -* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO LICENSEE: -* -* This source code and/or documentation ("Licensed Deliverables") are -* subject to NVIDIA intellectual property rights under U.S. and -* international Copyright laws. -* -* These Licensed Deliverables contained herein is PROPRIETARY and -* CONFIDENTIAL to NVIDIA and is being provided under the terms and -* conditions of a form of NVIDIA software license agreement by and -* between NVIDIA and Licensee ("License Agreement") or electronically -* accepted by Licensee. Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. 
-* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. -* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -/** -* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics -* This section describes nv_bfloat16 precision intrinsic functions that are -* only supported in device code. -* To use these functions, include the header file \p cuda_bf16.h in your program. 
-*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. 
-*/ - -#ifndef __CUDA_BF16_H__ -#define __CUDA_BF16_H__ - -#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x -#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) - -#if defined(__cplusplus) -#if defined(__CUDACC__) -#define __CUDA_BF16_DECL__ static __device__ __inline__ -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ -#else -#define __CUDA_HOSTDEVICE_BF16_DECL__ static -#endif /* defined(__CUDACC__) */ - -#define __CUDA_BF16_TYPES_EXIST__ - -/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ - -/** - * \brief nv_bfloat16 datatype - * - * \details This structure implements the datatype for storing - * nv_bfloat16 floating-point numbers. The structure implements - * assignment operators and type conversions. 16 bits are being - * used in total: 1 sign bit, 8 bits for the exponent, and - * the significand is being stored in 7 bits. The total - * precision is 8 bits. - * - */ -struct __nv_bfloat16; - -/** - * \brief nv_bfloat162 datatype - * - * \details This structure implements the datatype for storing two - * nv_bfloat16 floating-point numbers. - * The structure implements assignment operators and type conversions. - * - */ -struct __nv_bfloat162; - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - double. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. 
-* -* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-down mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-up mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts \p nv_bfloat16 number to float. -* -* \details Converts nv_bfloat16 number \p a to float. -* \param[in] a - float. Is only being read. -* -* \returns float -* - \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and -* populates both halves of \p nv_bfloat162 with converted value. -* -* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and -* populates both halves of \p nv_bfloat162 with converted value. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 -* precision number. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even -* mode and returns \p nv_bfloat162 with converted values. -* -* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode -* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return -* value correspond to the input \p a, high 16 bits correspond to the input \p -* b. -* \param[in] a - float. Is only being read. -* \param[in] b - float. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 value with corresponding halves equal to the -* converted input floats. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result -* -* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number -* and returns the result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns float -* - The low 16 bits of \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result -* -* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number -* and returns the result. -* \param[in] a - nv_bfloat162. Is only being read. 
-* -* \returns float -* - The high 16 bits of \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); - -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both components of float2 number to nv_bfloat16 precision in -* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. -* -* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest -* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the -* return value correspond to \p a.x and high 16 bits of the return value -* correspond to \p a.y. -* \param[in] a - float2. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 which has corresponding halves equal to the -* converted float2 components. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. -* -* \details Converts both halves of \p nv_bfloat162 input \p a to float2 and returns the -* result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns float2 -* - \p a converted to float2. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-to-nearest-even mode. NaN inputs are converted to 0. 
-* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. 
-* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. 
-* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even -* mode. 
-* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. 
-* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero -* mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-down mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-up mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero -* mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. 
-* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Truncate input argument to the integral part. -* -* \details Round \p h to the nearest integer value that does not exceed \p h in -* magnitude. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The truncated integer value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculate ceiling of the input argument. -* -* \details Compute the smallest integer value not less than \p h. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The smallest integer value not less than \p h. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* \details Calculate the largest integer value which is less than or equal to \p h. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The largest integer value which is less than or equal to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Round input to nearest integer value in nv_bfloat16 floating-point -* number. -* -* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point -* format, with bfloat16way cases rounded to the nearest even integer value. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The nearest integer to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. -* -* \details Round each component of vector \p h to the nearest integer value that does -* not exceed \p h in magnitude. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The truncated \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. 
-* -* \details For each component of vector \p h compute the smallest integer value not less -* than \p h. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of smallest integers not less than \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* \details For each component of vector \p h calculate the largest integer value which -* is less than or equal to \p h. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of largest integers which is less than or equal to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Round input to nearest integer value in nv_bfloat16 floating-point -* number. -* -* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in -* nv_bfloat16 floating-point format, with bfloat16way cases rounded to the -* nearest even integer value. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of rounded integer values. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns \p nv_bfloat162 with both halves equal to the input value. -* -* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 -* number. -* \param[in] a - nv_bfloat16. Is only being read. 
-* -* \returns nv_bfloat162 -* - The vector which has both its halves equal to the input \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Swaps both halves of the \p nv_bfloat162 input. -* -* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number -* with swapped halves. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - \p a with its halves being swapped. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines -* into one \p nv_bfloat162 number. -* -* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into -* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of -* the return value, low 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The low 16 bits of \p a and of \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and -* combines into one \p nv_bfloat162 number. -* -* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into -* one \p nv_bfloat162 number. 
High 16 bits from input \p a is stored in low 16 bits of -* the return value, high 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The high 16 bits of \p a and of \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns high 16 bits of \p nv_bfloat162 input. -* -* \details Returns high 16 bits of \p nv_bfloat162 input \p a. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat16 -* - The high 16 bits of the input. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns low 16 bits of \p nv_bfloat162 input. -* -* \details Returns low 16 bits of \p nv_bfloat162 input \p a. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat16 -* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Checks if the input \p nv_bfloat16 number is infinite. -* -* \details Checks if the input \p nv_bfloat16 number \p a is infinite. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns int -* - -1 iff \p a is equal to negative infinity, -* - 1 iff \p a is equal to positive infinity, -* - 0 otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. -* -* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. -* Input \p a is stored in low 16 bits of the return value, input \p b is stored -* in high 16 bits of the return value. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts low 16 bits from \p nv_bfloat162 input. -* -* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 -* number which has both halves equal to the extracted bits. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts high 16 bits from \p nv_bfloat162 input. -* -* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 -* number which has both halves equal to the extracted bits. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. -* -* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h -* as a signed short integer. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. -* -* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h -* as an unsigned short number. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. -* -* \details Reinterprets the bits in the signed short integer \p i as a -* nv_bfloat16 floating-point number. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. 
-* -* \details Reinterprets the bits in the unsigned short integer \p i as a -* nv_bfloat16 floating-point number. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); - -#if !defined warpSize && !defined __local_warpSize -#define warpSize 32 -#define __local_warpSize -#endif - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
-* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
-* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. 
Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
-* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
-* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. 
-* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); - -#if defined(__local_warpSize) -#undef warpSize -#undef __local_warpSize -#endif - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.nc` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.nc` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cg` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cg` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.ca` load instruction. 
-* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.ca` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cs` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cs` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.lu` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.lu` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cv` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cv` load instruction. 
-* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wb` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wb` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cg` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cg` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cs` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cs` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wt` store instruction. 
-* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wt` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); - -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs nv_bfloat162 vector if-equal comparison. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of if-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector not-equal comparison. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of not-equal comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-equal comparison. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-equal comparison. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of greater-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-than comparison. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. 
-* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-than comparison. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of greater-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered if-equal comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered not-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. -* -* Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered less-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
-* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered less-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Determine whether \p nv_bfloat162 argument is a NaN. -* -* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to -* 1.0 for NaN, 0.0 otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-95 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-104 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-102 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplying the vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode. Prevents floating-point contractions of mul+add into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-95 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma. 
-* \internal -* \req DEEPLEARN-SRM_REQ-104 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode. Prevents floating-point contractions of mul+add -* or sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-102 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplying the vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. -* -* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-103 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise division of \p a with \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and -* returns the result. -* -* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and -* returns the result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns bfloat2 -* - Returns \p a with the absolute value of both halves. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. 
-* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplication of vectors \p a and \p b, -* with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-105 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode, with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the -* results to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, -* with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Negates both halves of the input \p nv_bfloat162 number and returns the -* result. -* -* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. -* \internal -* \req DEEPLEARN-SRM_REQ-101 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - Returns \p a with both halves negated. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. 
-* -* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The absolute value of a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-94 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-97 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtracting \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode. 
-* \internal -* \req DEEPLEARN-SRM_REQ-99 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even -* mode. Prevents floating-point contractions of mul+add into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-94 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode. Prevents floating-point contractions of mul+sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-97 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtracting \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode. 
Prevents floating-point contractions of mul+add or sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-99 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. -* -* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-98 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of dividing \p a by \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. 
-* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtraction of \p b from \p a, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-96 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the result -* to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Negates input \p nv_bfloat16 number and returns the result. -* -* \details Negates input \p nv_bfloat16 number and returns the result. -* \internal -* \req DEEPLEARN-SRM_REQ-100 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - minus a -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true -* iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. 
-* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of if-equal comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of not-equal comparison -* of vectors \p a and \p b are true, -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. 
-* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of less-equal comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of greater-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns bool -* - true if both \p nv_bfloat16 results of less-than comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of greater-than -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered if-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered not-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered less-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and -* returns boolean true iff both \p nv_bfloat16 results are true, boolean false -* otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered -* greater-equal comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered less-than comparison of -* vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and -* returns boolean true iff both \p nv_bfloat16 results are true, boolean false -* otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered -* greater-than comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 if-equal comparison. -* -* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of if-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 not-equal comparison. -* -* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. 
Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of not-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 less-equal comparison. -* -* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of less-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 greater-equal comparison. -* -* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of greater-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 less-than comparison. -* -* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of less-than comparison of \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 greater-than comparison. -* -* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of greater-than comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered if-equal comparison. -* -* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered if-equal comparison of \p a and -* \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered not-equal comparison. -* -* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered not-equal comparison of \p a and -* \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered less-equal comparison. -* -* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered less-equal comparison of \p a and -* \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. -* -* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered greater-equal comparison of \p a -* and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered less-than comparison. -* -* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered less-than comparison of \p a and -* \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered greater-than comparison. -* -* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered greater-than comparison of \p a -* and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Determine whether \p nv_bfloat16 argument is a NaN. -* -* \details Determine whether \p nv_bfloat16 value \p a is a NaN. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns bool -* - true iff argument is NaN. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 maximum of two input values. -* -* \details Calculates \p nv_bfloat16 max(\p a, \p b) -* defined as (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. 
-* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 minimum of two input values. -* -* \details Calculates \p nv_bfloat16 min(\p a, \p b) -* defined as (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. -* -* \details Calculates \p nv_bfloat16 max(\p a, \p b) -* defined as (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. -* -* \details Calculates \p nv_bfloat16 min(\p a, \p b) -* defined as (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. 
-* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* Then negative result is clamped to 0. -* NaN result is converted to canonical NaN. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c with relu saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. -* -* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns nv_bfloat162 -* - The result of elementwise maximum of vectors \p a and \p b -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. -* -* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise minimum of vectors \p a and \p b -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. -* -* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns nv_bfloat162 -* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. -* -* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode with relu saturation. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* Then negative result is clamped to 0. -* NaN result is converted to canonical NaN. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs fast complex multiply-accumulate -* -* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as -* complex numbers in \p nv_bfloat16 precision and performs -* complex multiply-accumulate operation: a*b + c -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); - -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The square root of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The reciprocal square root of \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The reciprocal of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The natural logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The binary logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even -* mode. 
-* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The decimal logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The natural exponential function on \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The binary exponential function on \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The decimal exponential function on \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The cosine of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sine of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise square root on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. 
Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise reciprocal square root on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise reciprocal on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise natural logarithm on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise binary logarithm on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise decimal logarithm on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise exponential function on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector binary exponential function in -* round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise binary exponential function on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector decimal exponential function in -* round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise decimal exponential function on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise cosine on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise sine on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); - -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this -* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the -* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. -* -* \details The location of \p address must be in global or shared memory. This operation has undefined -* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. -* -* \param[in] address - __nv_bfloat162*. An address in global or shared memory. -* \param[in] val - __nv_bfloat162. The value to be added. -* -* \returns __nv_bfloat162 -* - The old value read from \p address. -* -* \note_ref_guide_atomic -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); - -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value -* back to \p address. This operation is performed in one atomic operation. -* -* \details The location of \p address must be in global or shared memory. This operation has undefined -* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. -* -* \param[in] address - __nv_bfloat16*. An address in global or shared memory. -* \param[in] val - __nv_bfloat16. The value to be added. -* -* \returns __nv_bfloat16 -* - The old value read from \p address. 
-* -* \note_ref_guide_atomic -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); - -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ - -#undef __CUDA_BF16_DECL__ -#undef __CUDA_HOSTDEVICE_BF16_DECL__ - -#endif /* defined(__cplusplus) */ - -/* Note the .hpp file is included even for host-side compilation, to capture the "nv_bfloat16" & "nv_bfloat162" definitions */ -#include "cuda_bf16.hpp" -#undef ___CUDA_BF16_STRINGIFY_INNERMOST -#undef __CUDA_BF16_STRINGIFY - -#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp deleted file mode 100644 index 30085da5e..000000000 --- a/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +++ /dev/null @@ -1,2683 +0,0 @@ -/* -* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO LICENSEE: -* -* This source code and/or documentation ("Licensed Deliverables") are -* subject to NVIDIA intellectual property rights under U.S. and -* international Copyright laws. -* -* These Licensed Deliverables contained herein is PROPRIETARY and -* CONFIDENTIAL to NVIDIA and is being provided under the terms and -* conditions of a form of NVIDIA software license agreement by and -* between NVIDIA and Licensee ("License Agreement") or electronically -* accepted by Licensee. Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. -* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
-* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -#if !defined(__CUDA_BF16_HPP__) -#define __CUDA_BF16_HPP__ - -#if !defined(__CUDA_BF16_H__) -#error "Do not include this file directly. Instead, include cuda_bf16.h." -#endif - -#if !defined(_MSC_VER) && __cplusplus >= 201103L -# define __CPP_VERSION_AT_LEAST_11_BF16 -#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L -# define __CPP_VERSION_AT_LEAST_11_BF16 -#endif - -/* C++11 header for std::move. 
- * In RTC mode, std::move is provided implicitly; don't include the header - */ -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) -#include -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ - -/* C++ header for std::memcpy (used for type punning in host-side implementations). - * When compiling as a CUDA source file memcpy is provided implicitly. - * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). - */ -#if defined(__cplusplus) && !defined(__CUDACC__) -#include -#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ - - -/* Set up function decorations */ -#if defined(__CUDACC__) -#define __CUDA_BF16_DECL__ static __device__ __inline__ -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#define __CUDA_HOSTDEVICE__ __host__ __device__ -#else /* !defined(__CUDACC__) */ -#if defined(__GNUC__) -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) -#else -#define __CUDA_HOSTDEVICE_BF16_DECL__ static -#endif /* defined(__GNUC__) */ -#define __CUDA_HOSTDEVICE__ -#endif /* defined(__CUDACC_) */ - -/* Set up structure-alignment attribute */ -#if defined(__CUDACC__) -#define __CUDA_ALIGN__(align) __align__(align) -#else -/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) -#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ -#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ -#if defined(__GNUC__) -#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) -#elif defined(_MSC_VER) -#define __CUDA_ALIGN__(n) __declspec(align(n)) -#else -#define __CUDA_ALIGN__(n) -#endif /* defined(__GNUC__) */ -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ -#endif /* defined(__CUDACC__) */ - -/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ -#define 
__BFLOAT16_TO_US(var) *(reinterpret_cast(&(var))) -#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast(&(var))) -#define __BFLOAT162_TO_UI(var) *(reinterpret_cast(&(var))) -#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast(&(var))) - -/** -* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until -* these become an actual builtin. Note this initialization is as a -* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16. -* Such a representation will be deprecated in a future version of CUDA. -* (Note these are visible to non-nvcc compilers, including C-only compilation) -*/ -typedef struct __CUDA_ALIGN__(2) { - unsigned short x; -} __nv_bfloat16_raw; - -typedef struct __CUDA_ALIGN__(4) { - unsigned short x; - unsigned short y; -} __nv_bfloat162_raw; - -/* All other definitions in this file are only visible to C++ compilers */ -#if defined(__cplusplus) - -/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -/* class' : multiple assignment operators specified - The class has multiple assignment operators of a single type. 
This warning is informational */ -#if defined(_MSC_VER) && _MSC_VER >= 1500 -#pragma warning( push ) -#pragma warning( disable:4522 ) -#endif /* defined(__GNUC__) */ - -struct __CUDA_ALIGN__(2) __nv_bfloat16 { -protected: - unsigned short __x; - -public: -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) - __nv_bfloat16() = default; -#else - __CUDA_HOSTDEVICE__ __nv_bfloat16() { } -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - - /* Convert to/from __nv_bfloat16_raw */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } - -#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) - /* Construct from float/double */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } - - __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } - - /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } - -/* Member functions only available to nvcc compilation so far */ -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - /* Allow automatic construction from types supported 
natively in hardware */ - /* Note we do avoid constructor init-list because of special host/device compilation rules */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } - - /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ - __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return 
*this; } - - /* Boolean conversion - note both 0 and -0 must return false */ - __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; } -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ -#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ -}; - -/* Global-space operator functions are only available to nvcc compilation */ -#if defined(__CUDACC__) - -#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) -#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) -/* Some basic arithmetic operations expected of a builtin */ -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } - -__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } - -/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */ -__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; } -__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 
0x3F80; h -= one; return h; } -__device__ __forceinline__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat16 ret = h; - __nv_bfloat16_raw one; - one.x = 0x3F80; - h += one; - return ret; -} -__device__ __forceinline__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat16 ret = h; - __nv_bfloat16_raw one; - one.x = 0x3F80; - h -= one; - return ret; -} -/* Unary plus and inverse operators */ -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } - -/* Some basic comparison operations to make it look like a builtin */ -__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } -__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } -__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } -__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } -__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } -__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } -#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ -#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ -#endif /* defined(__CUDACC__) */ - -/* __nv_bfloat162 is visible to non-nvcc host compilers */ -struct __CUDA_ALIGN__(4) __nv_bfloat162 { - 
__nv_bfloat16 x; - __nv_bfloat16 y; - - // All construct/copy/assign/move -public: -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) - __nv_bfloat162() = default; - __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; } -#else - __CUDA_HOSTDEVICE__ __nv_bfloat162() { } -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; } - - /* Convert to/from __nv_bfloat162_raw */ - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; } -}; - -/* Global-space operator functions are only available to nvcc compilation */ -#if defined(__CUDACC__) - -#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__) - -__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return 
__hmul2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } - -__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } - -__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; } -__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; } -__device__ __forceinline__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat162 ret = h; - __nv_bfloat162_raw one; - one.x = 0x3F80; - one.y = 0x3F80; - h = __hadd2(h, one); - return ret; -} -__device__ __forceinline__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
- static_cast(ignored); - - const __nv_bfloat162 ret = h; - __nv_bfloat162_raw one; - one.x = 0x3F80; - one.y = 0x3F80; - h = __hsub2(h, one); - return ret; -} -__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } -__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } - -__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } -__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } -__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } -__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } -__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } -__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } - -#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ -#endif /* defined(__CUDACC__) */ - -/* Restore warning for multiple assignment operators */ -#if defined(_MSC_VER) && _MSC_VER >= 1500 -#pragma warning( pop ) -#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ - -/* Restore -Weffc++ warnings from here on */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic pop -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -#undef __CUDA_HOSTDEVICE__ -#undef __CUDA_ALIGN__ - -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) -{ - unsigned int x; - -#if defined(__CUDA_ARCH__) - x = __float_as_uint(f); -#elif defined(__CUDACC__) - (void)memcpy(&x, &f, sizeof(f)); -#else - (void)std::memcpy(&x, &f, 
sizeof(f)); -#endif - - if ((x & 0x7fffffffU) > 0x7f800000U) { - sign = 0U; - remainder = 0U; - return static_cast(0x7fffU); - } - sign = x >> 31U; - remainder = x << 16U; - return static_cast(x >> 16U); -} - -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); - return val; -#else - - float f = static_cast(x); - const double d = static_cast(f); - unsigned int u; - -#if defined(__CUDA_ARCH__) - u = __float_as_uint(f); -#elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); -#else - (void)std::memcpy(&u, &f, sizeof(f)); -#endif - bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); - - - if ((x > 0.0) && (d > x)) { - u--; - } - if ((x < 0.0) && (d < x)) { - u--; - } - if ((d != x) && x_is_not_nan) { - u |= 1U; - } - -#if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); -#elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); -#else - (void)std::memcpy(&f, &u, sizeof(f)); -#endif - - return __float2bfloat16(f); - -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -} - -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { - r.x++; - } - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = 
__internal_float2bfloat16(a, sign, remainder); - if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { - r.x++; - } - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); - return val; -#else - __nv_bfloat16 val; - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder != 0U) && (sign != 0U)) { - r.x++; - } - val = r; - return val; -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); - return val; -#else - __nv_bfloat16 val; - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder != 0U) && (sign == 0U)) { - r.x++; - } - val = r; - return val; -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) -{ - __nv_bfloat162 val; -#if __CUDA_ARCH__ >= 800 - asm("{.reg .b16 low;\n" - " cvt.rn.bf16.f32 low, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); -#else - val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 
__floats2bfloat162_rn(const float a, const float b) -{ - __nv_bfloat162 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" - : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); -#else - val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); -#endif - return val; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) -{ - float f; -#if defined(__CUDA_ARCH__) - #if (__CUDA_ARCH__ >= 900) - asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); - #else - asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); - #endif -#else - unsigned int u = static_cast(h) << 16; - #if defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif -#endif - return f; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); -} -__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); -} - -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - -/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ -__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) -{ - __nv_bfloat162 t; t.x = x; t.y = y; return t; -} -#undef __VECTOR_FUNCTIONS_DECL__ - - -/* Definitions of intrinsics */ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) -{ - __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) -{ - float hi_float; - float lo_float; - lo_float = 
__internal_bfloat162float(((__nv_bfloat162_raw)a).x); - hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); - return make_float2(lo_float, hi_float); -} -__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - const float f = __bfloat162float(h); - int i; - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const int max_val = (int)0x7fffffffU; - const int min_val = (int)0x80000000U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif - return i; -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -} -__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) -{ -#if (defined __CUDA_ARCH__) - #if 
(__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; - #else - const float ru = __int2float_ru(i); - const float rd = __int2float_rd(i); - float rz = __int2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); - #endif -#else - const double d = static_cast(i); - return __double2bfloat16(d); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rz(__int2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rd(__int2float_rd(i)); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_ru(__int2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rni.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); -#elif (defined __CUDA_ARCH__) - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rzi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - val = static_cast(f); - const short int max_val = (short int)0x7fffU; - const short int min_val = (short int)0x8000U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - val = 0; - } else if (f > static_cast(max_val)) { - // saturate maximum - val = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - val = min_val; - } -#endif - return val; -} -__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rmi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rpi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - const float f = static_cast(i); - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) -{ -#if defined(__CUDA_ARCH__) && 
(__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rz(__int2float_rz(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rd(__int2float_rd(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_ru(__int2float_ru(static_cast(i))); -#endif -} - -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - - const float f = __bfloat162float(h); - unsigned int i; - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const unsigned int max_val = 0xffffffffU; - const unsigned int min_val = 0U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0U; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif - return i; - -#endif // defined(__CUDA_ARCH__) 
&& (__CUDA_ARCH__ >= 900) -} -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#elif (defined __CUDA_ARCH__) - const float ru = __uint2float_ru(i); - const float rd = __uint2float_rd(i); - float rz = __uint2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); -#else - const double d = static_cast(i); - return __double2bfloat16(d); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rz(__uint2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rd(__uint2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 
val; - asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_ru(__uint2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rni.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#elif (defined __CUDA_ARCH__) - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rzi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - val = static_cast(f); - const unsigned short int max_val = 0xffffU; - const unsigned short int min_val = 0U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - val = 0U; - } else if (f > static_cast(max_val)) { - // saturate maximum - val = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - val = min_val; - } -#endif - return val; -} -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rmi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} 
-__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rpi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - const float f = static_cast(i); - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); -#endif -} - -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) -{ - unsigned long long int i; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - const float f = __bfloat162float(h); - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const unsigned long long int max_val = 0xffffffffffffffffULL; - const unsigned long long int min_val = 0ULL; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0x8000000000000000ULL; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - return i; -} -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#elif (defined __CUDA_ARCH__) - const float ru = __ull2float_ru(i); - const float rd = 
__ull2float_rd(i); - float rz = __ull2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); -#else - float f = static_cast(i); - const unsigned long long int uf = static_cast(f); - unsigned int u; - - #if defined(__CUDA_ARCH__) - u = __float_as_uint(f); - #elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); - #else - (void)std::memcpy(&u, &f, sizeof(f)); - #endif - - // round up happened here - // note: no need to handle round up to f == 0x1.p64 specially - if (uf > i) { - u--; - } - if (uf != i) { - u |= 1U; - } - - #if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); - #elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif - - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rz(__ull2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rd(__ull2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_ru(__ull2float_ru(i)); -#endif -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else 
- return __float2ll_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) -{ - long long int i; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const long long int max_val = (long long int)0x7fffffffffffffffULL; - const long long int min_val = (long long int)0x8000000000000000ULL; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = min_val; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - return i; -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ll_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ll_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#elif (defined __CUDA_ARCH__) - const float ru = __ll2float_ru(i); - const float rd = __ll2float_rd(i); - float rz = __ll2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - 
return __float2bfloat16_rn(rz); -#else - float f = static_cast(i); - const long long int lf = static_cast(f); - unsigned int u; - - #if defined(__CUDA_ARCH__) - u = __float_as_uint(f); - #elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); - #else - (void)std::memcpy(&u, &f, sizeof(f)); - #endif - - if ((f > 0.0f) && (lf > i)) { - u--; - } - if ((f < 0.0f) && (lf < i)) { - u--; - } - if (lf != i) { - u |= 1U; - } - - #if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); - #elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif - - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rz(__ll2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rd(__ll2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_ru(__ll2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rz(truncf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) 
- __nv_bfloat16 r; - asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_ru(ceilf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rd(floorf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rn(rintf(__bfloat162float(h))); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h))); - return __nv_bfloat162(low, high); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h))); - return __nv_bfloat162(low, high); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h))); - return __nv_bfloat162(low, high); -} - -__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) -{ - return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); -} -__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" 
- " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" - " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) -{ - __nv_bfloat16 ret; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); - return ret; -} -__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) -{ - int retval; - if (__BFLOAT16_TO_CUS(a) == 0xFF80U) { - retval = -1; - } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) { - retval = 1; - } else { - retval = 0; - } - return retval; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) -{ - __nv_bfloat16 ret; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat162 val; - asm("{ mov.b32 %0, {%1,%2};}\n" - : 
"=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) -{ - __nv_bfloat162 val; - asm("{ mov.b32 %0, {%1,%1};}\n" - : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) -{ - return static_cast(__BFLOAT16_TO_CUS(h)); -} -__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) -{ - return __BFLOAT16_TO_CUS(h); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) -{ - __nv_bfloat16 h; - __BFLOAT16_TO_US(h) = static_cast(i); - return h; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) -{ - __nv_bfloat16 h; - __BFLOAT16_TO_US(h) = i; - return h; -} - -/****************************************************************************** -* __nv_bfloat16, __nv_bfloat162 warp shuffle * -******************************************************************************/ -#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\ - __nv_bfloat162 r; \ - asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ - :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ - return r; \ -} /* while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32) -} -__CUDA_BF16_DECL__ 
__nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = (warp_size - static_cast(width)) << 8U; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32) -} - -#undef __SHUFFLE_SYNC_BFLOAT162_MACRO - -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} 
-__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} - -/****************************************************************************** -* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * -******************************************************************************/ - -#if defined(__cplusplus) -#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __LDG_PTR "l" -#else -#define __LDG_PTR "r" -#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - 
return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} - -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), 
"h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} - -#undef __LDG_PTR -#endif /*defined(__cplusplus) */ -/****************************************************************************** -* __nv_bfloat162 comparison * -******************************************************************************/ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} -#else -#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - " shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ - " shr.u32 low_res, low_res, 16;\n"\ - " or.b32 %0, high_res, low_res;}\n"\ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} -#endif 
- -__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.eq) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ne) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.le) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ge) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.lt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.gt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.equ) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.neu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.leu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.geu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) -} -#undef __COMPARISON_OP_BFLOAT162_MACRO - -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - bool retval; \ - asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ - :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\ - retval = true; \ - } else { \ - retval = false; \ - }\ - return retval;\ -} -#else - -#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ - unsigned int val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - " shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ - " and.b32 %0, high_res, low_res;}\n"\ - :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return (val != 0U) ? true : false; \ -} -#endif - -__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) -} -__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) -} -__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) -} -__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) -} -__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) -} -__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) -} -__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) -} -__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) -} -__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - 
__BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) -} -__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) -} -__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) -} -__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) -} -#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO -/****************************************************************************** -* __nv_bfloat16 comparison * -******************************************************************************/ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ - unsigned short val; \ - asm( "{ .reg .pred __$temp3;\n" \ - " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ - " selp.u16 %0, 1, 0, __$temp3;}" \ - : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ - return (val != 0U) ? true : false; \ -} -#else -#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ - unsigned int val; \ - asm( "{.reg .b32 a,b;\n"\ - " mov.b32 a, {0, %1};\n"\ - " mov.b32 b, {0, %2};\n"\ - " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ - :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return (val != 0U) ? 
true : false; \ -} -#endif -__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(eq) -} -__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ne) -} -__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(le) -} -__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ge) -} -__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(lt) -} -__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(gt) -} -__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(equ) -} -__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(neu) -} -__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(leu) -} -__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(geu) -} -__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ltu) -} -__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(gtu) -} -#undef __COMPARISON_OP_BFLOAT16_MACRO -/****************************************************************************** -* __nv_bfloat162 arithmetic * -******************************************************************************/ -#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\ - __nv_bfloat162 val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " .reg .b16 low,high;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - 
" shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\ - " cvt.rn.bf16.f32 low, low_res;\n"\ - " cvt.rn.bf16.f32 high, high_res;\n"\ - " mov.b32 %0, {low,high};}\n"\ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} /* while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x3f803f80U;\n" - " fma.rn.bf16x2 %0,%1,c,%2;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0xbf80bf80U;\n" - " fma.rn.bf16x2 %0,%2,c,%1;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x80008000U;\n" - " fma.rn.bf16x2 %0,%1,%2,c;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x3f803f80U;\n" - " fma.rn.bf16x2 %0,%1,c,%2;}\n" -#endif - 
:"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0xbf80bf80U;\n" - " fma.rn.bf16x2 %0,%2,c,%1;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x80008000U;\n" - " fma.rn.bf16x2 %0,%1,%2,c;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " fma.rn.bf16x2 f,%1,one,%2;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero, mone;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " mov.b32 mone, 0xbf80bf80U;\n" - " fma.rn.bf16x2 f,%2,mone,%1;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero, mzero;\n" 
- " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " mov.b32 mzero, 0x80008000U;\n" - " fma.rn.bf16x2 f,%1,%2,mzero;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{ .reg .b32 f, one, zero;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " fma.rn.bf16x2 f, %1, %2, %3;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { - __nv_bfloat16 ha, hb; - - ha = __low2bfloat16(a); - hb = __low2bfloat16(b); - - const __nv_bfloat16 v1 = __hdiv(ha, hb); - - ha = __high2bfloat16(a); - hb = __high2bfloat16(b); - - const __nv_bfloat16 v2 = __hdiv(ha, hb); - - return __halves2bfloat162(v1, v2); -} -/****************************************************************************** -* __nv_bfloat16 arithmetic * -******************************************************************************/ -#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ - __nv_bfloat16 val; \ - asm( "{.reg .b32 a,b,res;\n"\ - " mov.b32 a, {0,%1};\n"\ - " mov.b32 b, {0,%2};\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ - " cvt.rn.bf16.f32 %0, res;}\n"\ - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; \ -} /* 
while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x3f80U;\n" - " fma.rn.bf16 %0,%1,c,%2;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0xbf80U;\n" - " fma.rn.bf16 %0,%2,c,%1;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x8000U;\n" - " fma.rn.bf16 %0,%1,%2,c;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x3f80U;\n" - " fma.rn.bf16 %0,%1,c,%2;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0xbf80U;\n" - " fma.rn.bf16 %0,%2,c,%1;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x8000U;\n" - " fma.rn.bf16 %0,%1,%2,c;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " fma.rn.bf16 f, %1, one, %2;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero, mone;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " mov.b16 mone, 0xbf80U;\n" - " fma.rn.bf16 f, %2, mone, %1;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero, mzero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " mov.b16 mzero, 0x8000U;\n" - " fma.rn.bf16 f, %1, %2, mzero;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " fma.rn.bf16 f, %1, %2, %3;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { - __BINARY_OP_BFLOAT16_MACRO(div.rn) -} - -/****************************************************************************** -* __nv_bfloat162 functions * -******************************************************************************/ -#define __APPROX_FCAST(fun) /* do */ {\ - __nv_bfloat16 val;\ - asm("{.reg.b32 f; \n"\ - " .reg.b16 r; \n"\ - " mov.b16 r,%1; \n"\ - " mov.b32 f,{0,r}; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 f,f; \n"\ - " cvt.rn.bf16.f32 r,f; \n"\ - " mov.b16 %0,r; \n"\ - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\ - return val;\ -} /* while(0) */ -#define __APPROX_FCAST2(fun) /* do */ {\ - __nv_bfloat162 val;\ - asm("{.reg.b16 hl, hu; \n"\ - " .reg.b32 fl, fu; \n"\ - " mov.b32 {hl, hu}, %1; \n"\ - " mov.b32 fl, {0,hl}; \n"\ - " mov.b32 fu, {0,hu}; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ - " cvt.rn.bf16.f32 hl, fl; \n"\ - " cvt.rn.bf16.f32 hu, fu; \n"\ - " mov.b32 %0, {hl, hu}; \n"\ - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ - return val;\ -} /* while(0) */ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { - float f = __bfloat162float(a); - f = sinf(f); - return __float2bfloat16_rn(f); -} -__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { - return 
__hsin_internal(a); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { - const __nv_bfloat16 l = __low2bfloat16(a); - const __nv_bfloat16 h = __high2bfloat16(a); - return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { - float f = __bfloat162float(a); - f = cosf(f); - return __float2bfloat16_rn(f); -} -__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { - return __hcos_internal(a); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { - const __nv_bfloat16 l = __low2bfloat16(a); - const __nv_bfloat16 h = __high2bfloat16(a); - return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); -} - -#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ - "{.reg.b32 spc, ulp, p;\n"\ - " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ - " mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ - " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ - " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" -#define __BF16_SPEC_CASE(i,r, spc, ulp) \ - "{.reg.b16 spc, ulp, p;\n"\ - " mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ - " mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ - " set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ - " fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" - -__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 h,r; \n" - " mov.b16 h,%1; \n" - " mov.b32 f,{0,h}; \n" - " mov.b32 C, 0x3FB8AA3CU; \n" - " mul.f32 f,f,C; \n" - " ex2.approx.f32 f,f; \n" - " cvt.rn.bf16.f32 r,f; \n" - " mov.b16 %0,r; \n" - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " 
mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " mov.b32 C, 0x3FB8AA3CU; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { - __APPROX_FCAST(ex2) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { - __APPROX_FCAST2(ex2) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " mov.b32 f, {0,h}; \n" - " mov.b32 C, 0x40549A78U; \n" - " mul.f32 f,f,C; \n" - " ex2.approx.f32 f, f; \n" - " cvt.rn.bf16.f32 r, f; \n" - __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U) - " mov.b16 %0, r; \n" - "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " mov.b32 C, 0x40549A78U; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { - __APPROX_FCAST(lg2) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { - __APPROX_FCAST2(lg2) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 r,h; \n" - " mov.b16 h,%1; \n" - 
" mov.b32 f,{0,h}; \n" - " lg2.approx.f32 f,f; \n" - " mov.b32 C, 0x3f317218U; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.bf16.f32 r,f; \n" - " mov.b16 %0,r; \n" - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3f317218U; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " mov.b32 f, {0,h}; \n" - " lg2.approx.f32 f, f; \n" - " mov.b32 C, 0x3E9A209BU; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.bf16.f32 r, f; \n" - " mov.b16 %0, r; \n" - "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3E9A209BU; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -#undef __BF16_SPEC_CASE2 -#undef __BF16_SPEC_CASE -__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { - __APPROX_FCAST2(rcp) -} 
-__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { - __APPROX_FCAST(rcp) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { - __APPROX_FCAST2(rsqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { - __APPROX_FCAST(rsqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { - __APPROX_FCAST2(sqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { - __APPROX_FCAST(sqrt) -} -#undef __APPROX_FCAST -#undef __APPROX_FCAST2 -__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) -{ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat162 r; - asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -#else - const __nv_bfloat162 b = a; - __BINARY_OP_BFLOAT162_MACRO(set.nan.f32) -#endif -} -__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) -{ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return __BFLOAT16_TO_CUS(r) != 0U; -#else - unsigned int r; - asm( "{.reg .b32 a;\n" - " mov.b32 a, {0,%1};\n" - " set.nan.f32.f32 %0, a, a;}\n" - :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); - return r != 0U; -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) -{ - __nv_bfloat162 r; - asm("{neg.bf16x2 %0,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) -{ - __nv_bfloat16 r; - asm("{neg.bf16 %0,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) -{ - __nv_bfloat162 r; - asm("{abs.bf16x2 %0,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) -{ - __nv_bfloat16 r; - asm("{abs.bf16 
%0,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return r; -} -/****************************************************************************** -* __nv_bfloat16 arithmetic * -******************************************************************************/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ max.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ min.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ max.NaN.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ min.NaN.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -/****************************************************************************** -* __nv_bfloat162 arithmetic * -******************************************************************************/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ max.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ min.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} - -__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - // fast version of complex multiply-accumulate - // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) - // acc.re = (c.re + a.re*b.re) - a.im*b.im - // acc.im = (c.im + a.re*b.im) + a.im*b.re - __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); - __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); - real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); - img_tmp = __hfma(a.y, b.x, img_tmp); - return make_bfloat162(real_tmp, img_tmp); -} - - -/* Define __PTR for atomicAdd prototypes below, undef after done */ -#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __PTR "l" -#else -#define __PTR "r" -#endif /*(defined(_MSC_VER) && defined(_WIN64)) || 
defined(__LP64__) || defined(__CUDACC_RTC__)*/ - -__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat162 r; - asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" - : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) - : "memory"); - return r; -#else - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; - do { - assumed = old; - __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); - old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); - } while (assumed != old); - return *(__nv_bfloat162*)&old; -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" - : "=h"(__BFLOAT16_TO_US(r)) - : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) - : "memory"); - return r; -#else - unsigned short int* address_as_us = (unsigned short int*)address; - unsigned short int old = *address_as_us, assumed; - do { - assumed = old; - old = atomicCAS(address_as_us, assumed, - __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); - } while (assumed != old); - return __ushort_as_bfloat16(old); -#endif -} - -#undef __PTR -#undef __CUDA_BF16_DECL__ -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ -#endif /* defined(__cplusplus) */ - -#undef __BINARY_OP_BFLOAT162_MACRO -#undef __BINARY_OP_BFLOAT16_MACRO - -#undef __CUDA_HOSTDEVICE_BF16_DECL__ -#undef __CUDA_BF16_DECL__ - -/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ -/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ -#if defined(__cplusplus) && 
!defined(CUDA_NO_BFLOAT16) -typedef __nv_bfloat16 nv_bfloat16; -typedef __nv_bfloat162 nv_bfloat162; - -#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ - -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) -#undef __CPP_VERSION_AT_LEAST_11_BF16 -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - -#endif /* end of include guard: __CUDA_BF16_HPP__ */ From 8b569c6e2d37eecc02d0f8299bfee9a56634ede2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 16:17:28 -0700 Subject: [PATCH 33/56] update format constant method for BfloatType --- numba_cuda/numba/cuda/models.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 02f629575..d6e28b82f 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -1,3 +1,4 @@ +import struct import functools from llvmlite import ir @@ -46,11 +47,16 @@ def __init__(self, dmm, fe_type): def _as_bfloat(value): - # Step 1: Convert to float - f = ir.types._as_float(value) + # Step 1: Reinterpret the input as u32 bits + u = struct.unpack("I", struct.pack("f", value))[0] + # Step 2: Truncate (or round, we choose truncate) last 16 bits - bf = f >> 16 - return bf + trunc = u >> 16 + + # Step 3: Unpack them back to Python floats + f = struct.unpack("f", struct.pack("I", trunc))[0] + + return f class BfloatType(ir.types._BaseFloatType): From 2148be9ee8a8ee83ee19cddb1ef0d736018ade65 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 12:33:25 -0700 Subject: [PATCH 34/56] implement printing support for bfloat16 --- numba_cuda/numba/cuda/printimpl.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/printimpl.py b/numba_cuda/numba/cuda/printimpl.py index 0a4fb4347..52e058358 100644 --- a/numba_cuda/numba/cuda/printimpl.py +++ b/numba_cuda/numba/cuda/printimpl.py @@ -5,7 +5,7 @@ from numba.core.errors import NumbaWarning from 
numba.core.imputils import Registry from numba.cuda import nvvmutils -from numba.cuda.types import Dim3 +from numba.cuda.types import Dim3, Bfloat16 from warnings import warn registry = Registry() @@ -48,6 +48,17 @@ def real_print_impl(ty, context, builder, val): return "%f", [lld] +@print_item.register(Bfloat16) +def bfloat16_print_impl(ty, context, builder, val): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(val, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + # printf("%f") expects a double; promote to f64 to match vararg expectation + f64 = builder.fpext(f32, ir.DoubleType()) + return "%f", [f64] + + @print_item.register(types.StringLiteral) def const_print_impl(ty, context, builder, sigval): pyval = ty.literal_value From 07b9c1e34084221134919f3d67f9a3d6889d3b14 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 12:34:14 -0700 Subject: [PATCH 35/56] implement to int conversion tests --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 257 +++++++----------- 1 file changed, 101 insertions(+), 156 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 1147bba11..f4f198d5b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,7 +1,8 @@ import unittest from importlib.util import find_spec +import numpy as np -from numba import cuda, float32 +from numba import cuda, float32, float64 from numba.cuda.bf16 import ( bfloat16, habs, @@ -40,6 +41,28 @@ float32_to_bfloat16_rz, float32_to_bfloat16_rd, float32_to_bfloat16_ru, + + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, + + int16_to_bfloat16_rn, + int16_to_bfloat16_rz, + int16_to_bfloat16_rd, + int16_to_bfloat16_ru, + bfloat16_to_int16_rn, + bfloat16_to_int16_rz, + bfloat16_to_int16_rd, + bfloat16_to_int16_ru, + + 
uint16_to_bfloat16_rn, + uint16_to_bfloat16_rz, + uint16_to_bfloat16_rd, + uint16_to_bfloat16_ru, + bfloat16_to_uint16_rn, + bfloat16_to_uint16_rz, + bfloat16_to_uint16_rd, + bfloat16_to_uint16_ru, + int32_to_bfloat16_rn, int32_to_bfloat16_rz, int32_to_bfloat16_rd, @@ -48,22 +71,29 @@ bfloat16_to_int32_rz, bfloat16_to_int32_rd, bfloat16_to_int32_ru, - bfloat16_to_int16_rn, - int16_to_bfloat16_rn, - bfloat16_to_uint16_rn, - uint16_to_bfloat16_rn, - bfloat16_to_uint32_rn, + uint32_to_bfloat16_rn, + uint32_to_bfloat16_rz, + uint32_to_bfloat16_rd, + uint32_to_bfloat16_ru, + bfloat16_to_uint32_rn, + bfloat16_to_uint32_rz, + bfloat16_to_uint32_rd, + bfloat16_to_uint32_ru, + bfloat16_to_int64_rn, - int64_to_bfloat16_rn, + bfloat16_to_int64_rz, + bfloat16_to_int64_rd, + bfloat16_to_int64_ru, bfloat16_to_uint64_rn, - uint64_to_bfloat16_rn, - bfloat16_as_short, - bfloat16_as_ushort, - short_as_bfloat16, - ushort_as_bfloat16, - bfloat16_to_int8_rz, - bfloat16_to_uint8_rz, + bfloat16_to_uint64_rz, + bfloat16_to_uint64_rd, + bfloat16_to_uint64_ru, + + bfloat16_as_int16, + bfloat16_as_uint16, + int16_as_bfloat16, + uint16_as_bfloat16, ) from numba.cuda.testing import CUDATestCase @@ -302,152 +332,67 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_int32_float32_precision_conversion_intrinsics(self): - self.skip_unsupported() - @cuda.jit - def kernel_float_to_bf16(out): - f = float32(3.14) - out[0] = float32(float32_to_bfloat16_rn(f)) - out[1] = float32(float32_to_bfloat16_rz(f)) - out[2] = float32(float32_to_bfloat16_rd(f)) - out[3] = float32(float32_to_bfloat16_ru(f)) - - @cuda.jit - def kernel_bf16_to_float(out): - a = bfloat16(3.14) - out[0] = bfloat16_to_float32(a) - - @cuda.jit - def kernel_int_to_bf16(out): - i = 3 - out[0] = float32(int32_to_bfloat16_rn(i)) - out[1] = float32(int32_to_bfloat16_rz(i)) - out[2] = float32(int32_to_bfloat16_rd(i)) - out[3] = float32(int32_to_bfloat16_ru(i)) - - 
@cuda.jit - def kernel_bf16_to_int(out): - a = bfloat16(3.14) - out[0] = bfloat16_to_int32_rn(a) - out[1] = bfloat16_to_int32_rz(a) - out[2] = bfloat16_to_int32_rd(a) - out[3] = bfloat16_to_int32_ru(a) - - out = cuda.device_array((4,), dtype="float32") - kernel_float_to_bf16[1, 1](out) - # Check they are near the original value in float32 after round-trip - # Note: Different rounding modes produce slightly different values - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) # rn - self.assertTrue(abs(out[1] - 3.140625) < 2e-2, out[1] - 3.140625) # rz - self.assertTrue(abs(out[2] - 3.140625) < 2e-2, out[2] - 3.140625) # rd - self.assertTrue(abs(out[3] - 3.140625) < 2e-2, out[3] - 3.140625) # ru - - out = cuda.device_array((1,), dtype="float32") - kernel_bf16_to_float[1, 1](out) - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) - - outi = cuda.device_array((4,), dtype="int32") - kernel_int_to_bf16[1, 1](outi) - # int to bf16 should be exactly representable for small integers - self.assertEqual(int(outi[0]), 3) - self.assertEqual(int(outi[1]), 3) - self.assertEqual(int(outi[2]), 3) - self.assertEqual(int(outi[3]), 3) - - outi = cuda.device_array((4,), dtype="int32") - kernel_bf16_to_int[1, 1](outi) - # 3.14 -> 3 for rz/rd, 3 or 4 for rn/ru depending on rounding - self.assertIn(int(outi[0]), (3, 4)) - self.assertEqual(int(outi[1]), 3) - self.assertEqual(int(outi[2]), 3) - self.assertIn(int(outi[3]), (3, 4)) - - def test_floatroundtrip_integer_conversion_intrinsics(self): + def test_to_integer_conversions(self): self.skip_unsupported() @cuda.jit - def kernel_scalar_roundtrip(out): - f = 3.14 - bf = float32_to_bfloat16(f) - out[0] = bfloat16_to_float32(bf) - d = 3.14 - bf2 = float64_to_bfloat16(d) - out[1] = bfloat16_to_float32(bf2) - - out = cuda.device_array((2,), dtype="float32") - kernel_scalar_roundtrip[1, 1](out) - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) - self.assertAlmostEqual(out[1], 3.140625, delta=1e-3) - - @cuda.jit - def 
kernel_int_family(outf): - outf[0] = float32(int16_to_bfloat16_rn(123)) - outf[1] = float32(uint16_to_bfloat16_rn(456)) - outf[2] = float32(uint32_to_bfloat16_rn(789)) - outf[3] = float32(int64_to_bfloat16_rn(1011)) - outf[4] = float32(uint64_to_bfloat16_rn(1213)) - - outf = cuda.device_array((5,), dtype="float32") - kernel_int_family[1, 1](outf) - vals = [123, 456, 789, 1011, 1213] - for i, v in enumerate(vals): - got = int(outf[i]) - # `step` estimates ULP near the integer `v`. - # Bfloat16 has 7 bits of precision, spacing between representable values are 2**(e-7). - # We use the exponent of the value `v` to raise the minSpacing, the result is a reasonable - # esitmate the local ULP. - step = ( - 0 if v == 0 else 2 ** (int(math.floor(math.log2(abs(v)))) - 7) - ) - # `allowed` is the maximum error in ULP, with a minimum of 1 - # In general, half ULP is the typical rounding error bound. - allowed = max(1, int(step // 2)) - self.assertLessEqual(abs(got - v), allowed) - - @cuda.jit - def kernel_from_bf16_to_ints(outi): - a = bfloat16(5.75) - outi[0] = bfloat16_to_int16_rn(a) - outi[1] = bfloat16_to_uint16_rn(a) - outi[2] = bfloat16_to_uint32_rn(a) - outi[3] = bfloat16_to_int64_rn(a) - outi[4] = bfloat16_to_uint64_rn(a) - - outi = cuda.device_array((5,), dtype="int64") - kernel_from_bf16_to_ints[1, 1](outi) - self.assertEqual(int(outi[0]), 6) - self.assertEqual(int(outi[1]), 6) - self.assertEqual(int(outi[2]), 6) - self.assertEqual(int(outi[3]), 6) - self.assertEqual(int(outi[4]), 6) - - @cuda.jit - def kernel_bit_reinterpret(out_short, out_ushort): - s = 12345 - bf = short_as_bfloat16(s) - out_short[0] = bfloat16_as_short(bf) - us = 54321 - bf2 = ushort_as_bfloat16(us) - out_ushort[0] = bfloat16_as_ushort(bf2) - - out_short = cuda.device_array((1,), dtype="int32") - out_ushort = cuda.device_array((1,), dtype="uint32") - kernel_bit_reinterpret[1, 1](out_short, out_ushort) - self.assertEqual(int(out_short[0]), 12345) - self.assertEqual(int(out_ushort[0]), 54321) - - 
@cuda.jit - def kernel_char(out_c, out_uc): - a = bfloat16(3.9) - out_c[0] = bfloat16_to_int8_rz(a) - out_uc[0] = bfloat16_to_uint8_rz(a) - - out_c = cuda.device_array((1,), dtype="int8") - out_uc = cuda.device_array((1,), dtype="uint8") - kernel_char[1, 1](out_c, out_uc) - self.assertEqual(int(out_c[0]), 3) - self.assertEqual(int(out_uc[0]), 3) + def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): + a = int16_as_bfloat16(test_val) + + i1[0] = bfloat16_to_int8_rz(a) + u1[0] = bfloat16_to_uint8_rz(a) + i2[0] = bfloat16_to_int16_rn(a) + i2[1] = bfloat16_to_int16_rz(a) + i2[2] = bfloat16_to_int16_rd(a) + i2[3] = bfloat16_to_int16_ru(a) + u2[0] = bfloat16_to_uint16_rn(a) + u2[1] = bfloat16_to_uint16_rz(a) + u2[2] = bfloat16_to_uint16_rd(a) + u2[3] = bfloat16_to_uint16_ru(a) + i3[0] = bfloat16_to_int32_rn(a) + i3[1] = bfloat16_to_int32_rz(a) + i3[2] = bfloat16_to_int32_rd(a) + i3[3] = bfloat16_to_int32_ru(a) + u3[0] = bfloat16_to_uint32_rn(a) + u3[1] = bfloat16_to_uint32_rz(a) + u3[2] = bfloat16_to_uint32_rd(a) + u3[3] = bfloat16_to_uint32_ru(a) + i4[0] = bfloat16_to_int64_rn(a) + i4[1] = bfloat16_to_int64_rz(a) + i4[2] = bfloat16_to_int64_rd(a) + i4[3] = bfloat16_to_int64_ru(a) + u4[0] = bfloat16_to_uint64_rn(a) + u4[1] = bfloat16_to_uint64_rz(a) + u4[2] = bfloat16_to_uint64_rd(a) + u4[3] = bfloat16_to_uint64_ru(a) + + # rz + i1 = cuda.device_array((1,), dtype="int8") + # rn, rz, rd, ru + i2 = cuda.device_array((4,), dtype="int16") + i3 = cuda.device_array((4,), dtype="int32") + i4 = cuda.device_array((4,), dtype="int64") + # rz + u1 = cuda.device_array((1,), dtype="uint8") + # rn, rz, rd, ru + u2 = cuda.device_array((4,), dtype="uint16") + u3 = cuda.device_array((4,), dtype="uint32") + u4 = cuda.device_array((4,), dtype="uint64") + + test_val = np.int16(0x3fc0) # 1.5 in bfloat16 + + kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4) + + self.assertEqual(i1[0], 1) + self.assertEqual(u1[0], 1) + + np.testing.assert_equal(i2, np.array([2, 1, 1, 2], "int16")) + 
np.testing.assert_equal(i3, np.array([2, 1, 1, 2], "int32")) + np.testing.assert_equal(i4, np.array([2, 1, 1, 2], "int64")) + np.testing.assert_equal(u2, np.array([2, 1, 1, 2], "uint16")) + np.testing.assert_equal(u3, np.array([2, 1, 1, 2], "uint32")) + np.testing.assert_equal(u4, np.array([2, 1, 1, 2], "uint64")) @unittest.skipIf( find_spec("ml_dtypes") is None, From 0834f6d3ea47bbb6e80a0c6bb73c047359dd090d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 14:51:09 -0700 Subject: [PATCH 36/56] add from integer conversion test --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 145 +++++++++++++++--- 1 file changed, 126 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index f4f198d5b..ec59a5285 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,8 +1,9 @@ import unittest from importlib.util import find_spec import numpy as np +from ml_dtypes import bfloat16 as mldtypes_bf16 -from numba import cuda, float32, float64 +from numba import cuda, float32, int16, int32, int64, uint16, uint32, uint64 from numba.cuda.bf16 import ( bfloat16, habs, @@ -34,17 +35,8 @@ hisnan, hisinf, # Conversion intrinsics (NumPy-style names) - bfloat16_to_float32, - float32_to_bfloat16, - float64_to_bfloat16, - float32_to_bfloat16_rn, - float32_to_bfloat16_rz, - float32_to_bfloat16_rd, - float32_to_bfloat16_ru, - bfloat16_to_int8_rz, bfloat16_to_uint8_rz, - int16_to_bfloat16_rn, int16_to_bfloat16_rz, int16_to_bfloat16_rd, @@ -53,7 +45,6 @@ bfloat16_to_int16_rz, bfloat16_to_int16_rd, bfloat16_to_int16_ru, - uint16_to_bfloat16_rn, uint16_to_bfloat16_rz, uint16_to_bfloat16_rd, @@ -62,7 +53,6 @@ bfloat16_to_uint16_rz, bfloat16_to_uint16_rd, bfloat16_to_uint16_ru, - int32_to_bfloat16_rn, int32_to_bfloat16_rz, int32_to_bfloat16_rd, @@ -71,7 +61,6 @@ bfloat16_to_int32_rz, bfloat16_to_int32_rd, 
bfloat16_to_int32_ru, - uint32_to_bfloat16_rn, uint32_to_bfloat16_rz, uint32_to_bfloat16_rd, @@ -80,20 +69,24 @@ bfloat16_to_uint32_rz, bfloat16_to_uint32_rd, bfloat16_to_uint32_ru, - bfloat16_to_int64_rn, bfloat16_to_int64_rz, bfloat16_to_int64_rd, bfloat16_to_int64_ru, + int64_to_bfloat16_rn, + int64_to_bfloat16_rz, + int64_to_bfloat16_rd, + int64_to_bfloat16_ru, bfloat16_to_uint64_rn, bfloat16_to_uint64_rz, bfloat16_to_uint64_rd, bfloat16_to_uint64_ru, - + uint64_to_bfloat16_rn, + uint64_to_bfloat16_rz, + uint64_to_bfloat16_rd, + uint64_to_bfloat16_ru, bfloat16_as_int16, - bfloat16_as_uint16, int16_as_bfloat16, - uint16_as_bfloat16, ) from numba.cuda.testing import CUDATestCase @@ -332,7 +325,6 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_to_integer_conversions(self): self.skip_unsupported() @@ -380,7 +372,7 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): u3 = cuda.device_array((4,), dtype="uint32") u4 = cuda.device_array((4,), dtype="uint64") - test_val = np.int16(0x3fc0) # 1.5 in bfloat16 + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4) @@ -394,6 +386,101 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): np.testing.assert_equal(u3, np.array([2, 1, 1, 2], "uint32")) np.testing.assert_equal(u4, np.array([2, 1, 1, 2], "uint64")) + def test_from_integer_conversions(self): + self.skip_unsupported() + + test_val = 789 + + @cuda.jit + def kernel(out): + i2 = int16(test_val) + i3 = int32(test_val) + i4 = int64(test_val) + u2 = uint16(test_val) + u3 = uint32(test_val) + u4 = uint64(test_val) + + i2rn = int16_to_bfloat16_rn(i2) + i2rz = int16_to_bfloat16_rz(i2) + i2rd = int16_to_bfloat16_rd(i2) + i2ru = int16_to_bfloat16_ru(i2) + + u2rn = uint16_to_bfloat16_rn(u2) + u2rz = uint16_to_bfloat16_rz(u2) + u2rd = uint16_to_bfloat16_rd(u2) + u2ru = uint16_to_bfloat16_ru(u2) + + i3rn = int32_to_bfloat16_rn(i3) + i3rz = 
int32_to_bfloat16_rz(i3) + i3rd = int32_to_bfloat16_rd(i3) + i3ru = int32_to_bfloat16_ru(i3) + + u3rn = uint32_to_bfloat16_rn(u3) + u3rz = uint32_to_bfloat16_rz(u3) + u3rd = uint32_to_bfloat16_rd(u3) + u3ru = uint32_to_bfloat16_ru(u3) + + i4rn = int64_to_bfloat16_rn(i4) + i4rz = int64_to_bfloat16_rz(i4) + i4rd = int64_to_bfloat16_rd(i4) + i4ru = int64_to_bfloat16_ru(i4) + + u4rn = uint64_to_bfloat16_rn(u4) + u4rz = uint64_to_bfloat16_rz(u4) + u4rd = uint64_to_bfloat16_rd(u4) + u4ru = uint64_to_bfloat16_ru(u4) + + out[0] = bfloat16_as_int16(i2rn) + out[1] = bfloat16_as_int16(i2rz) + out[2] = bfloat16_as_int16(i2rd) + out[3] = bfloat16_as_int16(i2ru) + out[4] = bfloat16_as_int16(u2rn) + out[5] = bfloat16_as_int16(u2rz) + out[6] = bfloat16_as_int16(u2rd) + out[7] = bfloat16_as_int16(u2ru) + out[8] = bfloat16_as_int16(i3rn) + out[9] = bfloat16_as_int16(i3rz) + out[10] = bfloat16_as_int16(i3rd) + out[11] = bfloat16_as_int16(i3ru) + out[12] = bfloat16_as_int16(u3rn) + out[13] = bfloat16_as_int16(u3rz) + out[14] = bfloat16_as_int16(u3rd) + out[15] = bfloat16_as_int16(u3ru) + out[16] = bfloat16_as_int16(i4rn) + out[17] = bfloat16_as_int16(i4rz) + out[18] = bfloat16_as_int16(i4rd) + out[19] = bfloat16_as_int16(i4ru) + out[20] = bfloat16_as_int16(u4rn) + out[21] = bfloat16_as_int16(u4rz) + out[22] = bfloat16_as_int16(u4rd) + out[23] = bfloat16_as_int16(u4ru) + + out = cuda.device_array((24,), dtype="int16") + kernel[1, 1](out) + res = out.copy_to_host() + + i2 = np.int16(789).astype(mldtypes_bf16).view("int16") + i3 = np.int32(789).astype(mldtypes_bf16).view("int16") + i4 = np.int64(789).astype(mldtypes_bf16).view("int16") + u2 = np.uint16(789).astype(mldtypes_bf16).view("int16") + u3 = np.uint32(789).astype(mldtypes_bf16).view("int16") + u4 = np.uint64(789).astype(mldtypes_bf16).view("int16") + + i2arr = np.array([i2] * 4) + i3arr = np.array([i3] * 4) + i4arr = np.array([i4] * 4) + u2arr = np.array([u2] * 4) + u3arr = np.array([u3] * 4) + u4arr = np.array([u4] * 4) + + two 
= np.ones_like(res[0:4]) * 2 + np.testing.assert_array_less(_bf16_ulp_distance(res[0:4], i2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[4:8], i3arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[8:12], i4arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[12:16], u2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], u3arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", @@ -401,3 +488,23 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): def test_use_bfloat16_on_host(self): x = bfloat16(3.0) self.assertEqual(x, 3.0) + + +def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray: + """ + Compute the ULP rank of a bfloat16 value. Input is the bits of the bfloat16 value as an int16. + The ULP rank is the number of ULPs between the value and 0. + Negative values are performed the inverse of 2's complement before computing the rank. + """ + u = bits_int16.view(np.uint16) + sign = u >> 15 + return np.where(sign == 0, u + 0x8000, 0x8000 - u).astype(np.int32) + + +def _bf16_ulp_distance( + a_bits_int16: np.ndarray, b_bits_int16: np.ndarray +) -> np.ndarray: + """ + Compute the difference between two bfloat16 values in ULPs. 
+ """ + return np.abs(_bf16_ulp_rank(a_bits_int16) - _bf16_ulp_rank(b_bits_int16)) From 264f06986f7ac4b6d17b36bc3ec89a74f100365a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 15:16:43 -0700 Subject: [PATCH 37/56] testing bitcast operations --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 95 ++++++++++++++++++- numba_cuda/numba/cuda/types.py | 2 +- 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index ec59a5285..79cf6c976 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -3,7 +3,17 @@ import numpy as np from ml_dtypes import bfloat16 as mldtypes_bf16 -from numba import cuda, float32, int16, int32, int64, uint16, uint32, uint64 +from numba import ( + cuda, + float32, + float64, + int16, + int32, + int64, + uint16, + uint32, + uint64, +) from numba.cuda.bf16 import ( bfloat16, habs, @@ -87,6 +97,15 @@ uint64_to_bfloat16_ru, bfloat16_as_int16, int16_as_bfloat16, + bfloat16_as_uint16, + uint16_as_bfloat16, + bfloat16_to_float32, + float32_to_bfloat16, + float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, ) from numba.cuda.testing import CUDATestCase @@ -325,6 +344,20 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + def test_bfloat16_as_bitcast(self): + @cuda.jit + def roundtrip_kernel(test_val, i2, u2): + i2[0] = int16_as_bfloat16(bfloat16_as_int16(test_val)) + u2[0] = uint16_as_bfloat16(bfloat16_as_uint16(test_val)) + + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 + i2 = cuda.device_array((1,), dtype="int16") + u2 = cuda.device_array((1,), dtype="uint16") + roundtrip_kernel[1, 1](test_val, i2, u2) + + self.assertEqual(i2[0], test_val) + self.assertEqual(u2[0], test_val) + def test_to_integer_conversions(self): 
self.skip_unsupported() @@ -481,6 +514,66 @@ def kernel(out): np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], u3arr), two) np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two) + def test_to_float_conversions(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + out[0] = bfloat16_to_float32(a) + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.5, delta=1e-7) # conversion is exact + + def test_from_float_conversions(self): + self.skip_unsupported() + + test_val = 1.5 + + @cuda.jit + def kernel(out): + f4 = float32(test_val) + f8 = float64(test_val) + + f4rn = float32_to_bfloat16_rn(f4) + f4rz = float32_to_bfloat16_rz(f4) + f4rd = float32_to_bfloat16_rd(f4) + f4ru = float32_to_bfloat16_ru(f4) + + f4_default = float32_to_bfloat16(f4) + f8_default = float64_to_bfloat16(f8) + + out[0] = bfloat16_as_int16(f4rn) + out[1] = bfloat16_as_int16(f4rz) + out[2] = bfloat16_as_int16(f4rd) + out[3] = bfloat16_as_int16(f4ru) + out[4] = bfloat16_as_int16(f4_default) + out[5] = bfloat16_as_int16(f8_default) + + out = cuda.device_array((1,), dtype="int16") + kernel[1, 1](out) + raw = out.copy_to_host() + + f4_expected = ( + np.array([test_val] * 4, "float32") + .astype(mldtypes_bf16) + .view("int16") + ) + f8_expected = ( + np.array([test_val] * 1, "float64") + .astype(mldtypes_bf16) + .view("int16") + ) + + np.testing.assert_array_less( + _bf16_ulp_distance(raw[0:4], f4_expected), 2 + ) + np.testing.assert_array_less( + _bf16_ulp_distance(raw[4:], f8_expected), 2 + ) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 5ddcaef5e..17a4184d1 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -52,7 +52,7 @@ def __init__(self): self.alignof_ = 2 self.bitwidth = 16 - def can_convert_from(self, other): + def 
can_convert_from(self, typingctx, other): if isinstance(other, types.Float): return Conversion.unsafe From 88ac53eb0565dd4b69b75c232baa4d7e455d17bc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 21:42:12 -0700 Subject: [PATCH 38/56] add fp16, bf16 vended headers --- numba_cuda/numba/cuda/include/13/cuda_bf16.h | 5118 ++++++++++++++++ numba_cuda/numba/cuda/include/13/cuda_fp16.h | 5363 +++++++++++++++++ .../numba/cuda/include/13/cuda_fp16.hpp | 3483 +++++++++++ 3 files changed, 13964 insertions(+) create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_fp16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_fp16.hpp diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h new file mode 100644 index 000000000..38feffba0 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h @@ -0,0 +1,5118 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions. +* To use these functions, include the header file \p cuda_bf16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. 
+* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. Specific examples are: +* - hsin(__nv_bfloat16); +* - hcos(__nv_bfloat16); +* - h2sin(__nv_bfloat162); +* - h2cos(__nv_bfloat162); +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent +* the use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p __nv_bfloat16 which is essentially a user-defined type. +* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ - +* If defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these constants, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/
+
+#ifndef __CUDA_BF16_H__
+#define __CUDA_BF16_H__
+
+/* bring in __half data type and operations, for use in converting constructors */
+#include "cuda_fp16.h"
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+#define __CUDA_BF16_TYPES_EXIST__
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Forward-declaration of structures defined in "cuda_bf16.hpp" */
+struct __nv_bfloat16;
+struct __nv_bfloat162;
+
+/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode. +* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2bfloat16(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-towards-zero mode. +* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-down mode. 
+* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-up mode. +* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts \p nv_bfloat16 number to float. +* +* \details Converts nv_bfloat16 number \p a to float. +* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __bfloat162float(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* +* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 +* precision number. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even +* mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2bfloat16_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. 
+* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the +* result as a \p float2 packed value. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __bfloat162char_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rn(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. 
+* - __bfloat162int_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. 
+* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. 
+* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p x and \p y into one \p nv_bfloat162 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - nv_bfloat16. Is only being read. +* \param[in] y - nv_bfloat16. 
Is only being read. +* +* \returns __nv_bfloat162 +* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. 
+* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. 
+* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The smallest integer value not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The nearest integer to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
+*
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The truncated \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. 
+*
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of smallest integers not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of largest integers which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+*
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 +* number.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat162 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Swaps both halves of the \p nv_bfloat162 input. +* +* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number +* with swapped halves. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. 
High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. 
+* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. 
+* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. 
Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. +* +* Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. 
+* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. 
Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. 
Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. 
Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. 
+* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). 
+* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even +* mode. 
+* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function. +* +* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function. +* +* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. 
+* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. 
Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. 
The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices use emulation path. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices of compute capability 7.x and 8.x use emulation path. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. +* +* \returns __nv_bfloat16 +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for ::std::move. + * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_INLINE__ +#define __CUDA_BF16_FORCEINLINE__ +#else +#define __CUDA_BF16_INLINE__ inline +#define __CUDA_BF16_FORCEINLINE__ __forceinline__ +#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_BF16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_BF16_CONSTEXPR__ constexpr +#else +#define __CUDA_BF16_CONSTEXPR__ +#endif + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat16_raw data type + * \details Type allows static initialization of \p nv_bfloat16 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat16, + * and not a conversion from \p short to \p nv_bfloat16. 
+ * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p nv_bfloat16 floating-point number. + */ + unsigned short x; +} __nv_bfloat16_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat162_raw data type + * \details Type allows static initialization of \p nv_bfloat162 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat162, + * and not a conversion from \p short2 to \p nv_bfloat162. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p nv_bfloat16 part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p nv_bfloat16 part. + */ + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. + */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. 
+ */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile; + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode. + */ + explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2bfloat16(__half2float(f)).__x; +) +} +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p float operator. + */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __int2bfloat16_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __uint2bfloat16_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162char_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162uchar_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in signed and unsigned char operators. + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162short_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ushort_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162int_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p int data type. 
+ * Using round-toward-zero rounding mode. + * + * See __bfloat162uint_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ll_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ull_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 addition operation. + * See also __hadd(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 subtraction operation. + * See also __hsub(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 multiplication operation. 
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 division operation. + * See also __hdiv(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with division operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix decrement operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored); +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary minus operator. + * See also __hneg(__nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h); + +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered compare equal operation. + * See also __heq(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 unordered compare not-equal operation. + * See also __hneu(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-than compare operation. 
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-than compare operation. + * See also __hlt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hge(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-or-equal compare operation. + * See also __hle(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +/** +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat162 datatype + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + /** + * Storage field holding lower \p __nv_bfloat16 part. + */ + __nv_bfloat16 x; + /** + * Storage field holding upper \p __nv_bfloat16 part. + */ + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src); +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162(); +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from two \p __nv_bfloat16 variables + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src); + + /* Convert to/from __nv_bfloat162_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const; +}; + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 addition operation. 
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 subtraction operation. + * See also __hsub2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 multiplication operation. + * See also __hmul2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 division operation. + * See also __h2div(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with division operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary minus operator. + * See also __hneg2(__nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered compare equal operation. + * See also __hbeq2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 unordered compare not-equal operation. 
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-than compare operation. + * See also __hbgt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-than compare operation. + * See also __hblt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hbge2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation. 
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); + +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ +#ifdef __CUDACC_RTC__ +inline +#else +__CUDA_BF16_FORCEINLINE__ +#endif +__half::__half(const __nv_bfloat16 f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2half_rn(__bfloat162float(f)).__x; +) +} +#endif +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) +/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. +*/ +#include "cuda_bf16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the bfloat16 numbers format. + * + * \details Should be implemented in the compiler in the future. 
+ * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat16 nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of bfloat16 numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_BF16_INLINE__ +#undef __CUDA_BF16_FORCEINLINE__ +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.h b/numba_cuda/numba/cuda/include/13/cuda_fp16.h new file mode 100644 index 000000000..788b81452 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.h @@ -0,0 +1,5363 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions. +* To use these functions, include the header file \p cuda_fp16.h in your program. +* All of the functions defined here are available in device code. 
+* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the +* use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p half which is essentially a user-defined type. +* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If +* defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p half and \p half2 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS Half Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these constants, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else 
/* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Macros to allow half & half2 to be used by inline assembly */ +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast(&(var))) + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ +struct __half; +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __double2half \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2half \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2half(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. 
+* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __float2half_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-towards-zero mode. +* - __float2half_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rz(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-down mode. +* - __float2half_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-up mode. +* - __float2half_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. 
+* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __half2float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __half2float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __half2float(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with both halves equal to the converted half +* precision number. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2half_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed char in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed char +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __half2char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. 
+* - __half2char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __half2char_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned char in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __half2uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __half2uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __half2short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-towards-zero mode. +* - __half2ushort_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __half2int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-towards-zero mode. +* - __half2uint_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-towards-zero mode. +* - __half2ll_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rz(NaN) returns \p 0x8000000000000000. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-towards-zero mode. +* - __half2ull_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rz(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Vector function, combines two \p __half numbers into one \p __half2 number. +* +* \details Combines two input \p __half number \p x and \p y into one \p __half2 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - half. Is only being read. +* \param[in] y - half. Is only being read. +* +* \returns __half2 +* - The \p __half2 vector with one half equal to \p x and the other to \p y. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of \p float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of \p float2 to half precision in round-to-nearest-even +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted \p float2 components. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to \p float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to \p float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to \p float2. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __half2int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __half2int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __half2int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. 
+* - __half2int_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. 
+* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __half2short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. +* - __half2short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. 
+* - __half2short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __half2short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. 
+* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-to-nearest-even mode. +* - __half2uint_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. 
+* - __half2uint_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-down mode. +* - __half2uint_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-up mode. +* - __half2uint_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_ru(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. 
+* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-to-nearest-even mode. +* - __half2ushort_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-down mode. +* - __half2ushort_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. 
+* - __half2ushort_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-up mode.
+* - __half2ushort_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-to-nearest-even mode. +* - __half2ull_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. 
+* - __half2ull_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-down mode. +* - __half2ull_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-up mode. +* - __half2ull_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. 
+* - __half2ull_ru(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-to-nearest-even mode. +* - __half2ll_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. 
+* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-down mode. +* - __half2ll_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-up mode. +* - __half2ll_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_ru(NaN) returns \p 0x8000000000000000. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. 
Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the largest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The truncated value. +* - htrunc( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - htrunc( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - htrunc(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The smallest integer value not less than \p h. +* - hceil( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hceil( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hceil(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. 
+* \param[in] h - half. Is only being read. +* +* \returns half +* - The largest integer value which is less than or equal to \p h. +* - hfloor( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hfloor( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hfloor(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* - hrint( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrint( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrint(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the largest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* +* \see htrunc(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* +* \see hceil(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which is less than or equal to \p h. +* +* \see hfloor(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* +* \see hrint(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* \details Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* \param[in] a - half. Is only being read. +* +* \returns half2 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* \details Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines +* into one \p half2 number. +* +* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The low 16 bits of \p a and of \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from each of the two \p half2 inputs and +* combines into one \p half2 number. +* +* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns high 16 bits of \p half2 input. +* +* \details Returns high 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns low 16 bits of \p half2 input. +* +* \details Returns low 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - Returns \p half which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Checks if the input \p half number is infinite. 
+* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Combines two \p half numbers into one \p half2 number. +* +* \details Combines two input \p half number \p a and \p b into one \p half2 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half2 +* - The half2 with one half equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The half2 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating-point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. 
+* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? 
\p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +#if defined(_WIN32) +# define __CUDA_FP16_DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(_NVHPC_CUDA) +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release." +#else +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
+#endif + +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize); + +#undef __CUDA_FP16_WSB_DEPRECATION_MESSAGE +#undef __CUDA_FP16_DEPRECATED__ +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. 
If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. 
If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */ + +#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. 
+* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. 
+* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. 
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. 
+* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* +* \see __habs(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the results to range [0.0, 1.0]. 
NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* +* \see __hneg(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The absolute value of \p a. +* - __habs \cuda_math_formula (\pm 0)\end_cuda_math_formula returns +0. +* - __habs \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - __habs(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. 
Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* - Negated input \p a. +* - __hneg \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \mp 0 \end_cuda_math_formula. +* - __hneg \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \mp \infty \end_cuda_math_formula. +* - __hneg(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison and returns boolean true +* if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. 
+* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision: (a.x + I*a.y), (b.x + I*b.y), (c.x + I*c.y) +* and performs complex multiply-accumulate operation: a*b + c in a simple way: +* ((a.x*b.x + c.x) - a.y*b.y) + I*((a.x*b.y + c.y) + a.y*b.x) +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* - __half2 result = __hcmadd(a, b, c) is numerically in agreement with: +* - result.x = __hfma(-a.y, b.y, __hfma(a.x, b.x, c.x)) +* - result.y = __hfma( a.y, b.x, __hfma(a.x, b.y, c.y)) +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input: \cuda_math_formula \sqrt{a} \end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* - hsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+* - hsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input: \cuda_math_formula \frac{1}{\sqrt{a}}\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal square root of \p a. +* - hrsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns +0. +* - hrsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hrsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input: \cuda_math_formula \frac{1}{a}\end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* - hrcp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrcp \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrcp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input: \cuda_math_formula \ln(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* - hlog \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog(1) returns +0. +* - hlog(x), x < 0 returns NaN. +* - hlog \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input: \cuda_math_formula \log_{2}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* - hlog2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog2(1) returns +0. +* - hlog2(x), x < 0 returns NaN. +* - hlog2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input: \cuda_math_formula \log_{10}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* - hlog10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog10(1) returns +0. +* - hlog10(x), x < 0 returns NaN. +* - hlog10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half natural exponential function of input: \cuda_math_formula e^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. +* - hexp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates approximate \p half hyperbolic tangent function. +* +* \details Calculates approximate \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh_approx(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector approximate hyperbolic tangent function. +* +* \details Calculates \p half2 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half hyperbolic tangent function in +* round-to-nearest-even mode. 
+* +* \details Calculates \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half binary exponential function of input: \cuda_math_formula 2^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* - hexp2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp2 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half decimal exponential function of input: \cuda_math_formula 10^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. +* - hexp10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp10 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* - hcos \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hcos \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hcos(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. 
+* +* \returns half +* - The sine of \p a. +* - hsin \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - hsin \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hsin(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* +* \see hsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal square root on vector \p a. +* +* \see hrsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The elementwise reciprocal on vector \p a. +* +* \see hrcp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise natural logarithm on vector \p a. +* +* \see hlog(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary logarithm on vector \p a. +* +* \see hlog2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal logarithm on vector \p a. +* +* \see hlog10(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise exponential function on vector \p a. +* +* \see hexp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary exponential function on vector \p a. +* +* \see hexp2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. 
+* +* \see hexp10(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* +* \see hcos(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. +* +* \see hsin(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two \p __half elements; the entire \p __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. 
This operation is natively supported by devices of compute capability 6.x and higher, +* older devices use emulation path. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /*defined(__CUDACC__) || defined(_NVHPC_CUDA)*/ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#endif + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +/* C++11 header for ::std::move. 
+ * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) +#include +#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). + */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_INLINE__ +#define __CUDA_FP16_FORCEINLINE__ +#else +#define __CUDA_FP16_INLINE__ inline +#define __CUDA_FP16_FORCEINLINE__ __forceinline__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_FP16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_FP16) +#define __CUDA_FP16_CONSTEXPR__ constexpr +#else +#define __CUDA_FP16_CONSTEXPR__ +#endif + +/** + * \ingroup 
CUDA_MATH_INTRINSIC_HALF + * \brief __half_raw data type + * \details Type allows static initialization of \p half until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half, + * and not a conversion from \p short to \p half. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p half floating-point number. + */ + unsigned short x; +} __half_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2_raw data type + * \details Type allows static initialization of \p half2 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half2, + * and not a conversion from \p short2 to \p half2. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p half part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p half part. + */ + unsigned short y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +// forward-declaration of bfloat type to be used in converting constructor +struct __nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half data type + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment, arithmetic and comparison operators, and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * The objective here is to provide IEEE754-compliant implementation + * of \p binary16 type and arithmetic with limitations due to + * device HW not supporting floating-point exceptions. + */ +struct __CUDA_ALIGN__(2) __half { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /* Convert to/from __half_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half(const __half_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p volatile \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const volatile; +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p __nv_bfloat16 input using default round-to-nearest-even rounding mode. + * Need to include the header file \p cuda_bf16.h + */ + explicit __CUDA_HOSTDEVICE__ __half(const __nv_bfloat16 f); //forward declaration only, implemented in cuda_bf16.hpp +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p float operator. 
+ */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2half_rn(static_cast(val)).__x; + } else { + __x = __int2half_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2half_rn(static_cast(val)).__x; + } else { + __x = __uint2half_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2char_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uchar_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in __half2char_rz(__half) and __half2uchar_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2short_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ushort_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2int_rz(__half) for further details. 
+ */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uint_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p long type and proceeds accordingly, see + * further details in __half2int_rz(__half) and __half2ll_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p unsigned \p long type and proceeds + * accordingly, see further details in __half2uint_rz(__half) and __half2ull_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ll_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ull_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ + +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half subtraction operation. 
+ * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with subtraction operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh); +/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix increment operation. 
+ * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix increment operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored); + +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary minus operator. + * \see __hneg(__half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h); +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered compare equal operation. + * \see __heq(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half unordered compare not-equal operation. + * \see __hneu(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-than compare operation. 
+ * \see __hgt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-than compare operation. + * \see __hlt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-or-equal compare operation. + * \see __hge(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-or-equal compare operation. + * \see __hle(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh); +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2 data type + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __half2 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __half2 { + /** + * Storage field holding lower \p __half part. + */ + __half x; + /** + * Storage field holding upper \p __half part. + */ + __half y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src); +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from two \p __half variables + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src); + + /* Convert to/from __half2_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p __half2_raw + */ + __CUDA_HOSTDEVICE__ operator __half2_raw() const; +}; + +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +/** + * \ingroup 
CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half division operation. + * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with division operation. 
+ * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary minus operator. + * \see __hneg2(__half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered compare equal operation. + * \see __hbeq2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half unordered compare not-equal operation. 
+ * \see __hbneu2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-than compare operation. + * \see __hbgt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-than compare operation. + * \see __hblt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-or-equal compare operation. + * \see __hbge2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-or-equal compare operation. + * \see __hble2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh); + +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) + +/* Note the .hpp file is included to capture the "half" & "half2" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. 
+*/ +#include "cuda_fp16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the half-precision numbers format. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __half half; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of half-precision numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. 
+ */ +typedef __half2 half2; +// for consistency with __nv_bfloat16 + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half __nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2 __nv_half2; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half_raw __nv_half_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2_raw __nv_half2_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_FP16_INLINE__ +#undef __CUDA_FP16_FORCEINLINE__ +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp new file mode 100644 index 000000000..4259992df --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp @@ -0,0 +1,3483 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. 
+* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. 
Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +#if !defined(__CUDA_FP16_H__) +#error "Do not include this file directly. Instead, include cuda_fp16.h." +#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* Macros for half & half2 binary arithmetic */ +#define __BINARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} /* while(0) */ + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines 
floating-point positive infinity value for the \p half data type + */ +#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines canonical NaN value for the \p half data type + */ +#define CUDART_NAN_FP16 __ushort_as_half((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p half data type + */ +#define CUDART_MIN_DENORM_FP16 __ushort_as_half((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a maximum representable value for the \p half data type + */ +#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a negative zero value for the \p half data type + */ +#define CUDART_NEG_ZERO_FP16 __ushort_as_half((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a positive zero value for the \p half data type + */ +#define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a value of 1.0 for the \p half data type + */ +#define CUDART_ONE_FP16 __ushort_as_half((unsigned short)0x3C00U) + +#if !(defined __DOXYGEN_ONLY__) + +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const __half_raw &hr) { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const volatile { __half_raw 
ret; ret.x = __x; return ret; } +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator float() const { return __half2float(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; } +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator signed char() const { return __half2char_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned char() const { return __half2uchar_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast(__half2char_rz(*this)); + } + else + { + value = static_cast(__half2uchar_rz(*this)); + } + return value; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator short() const { return __half2short_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned short() const { return __half2ushort_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator int() const { return __half2int_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned int() const { return __half2uint_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ 
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__half2ll_rz(*this)); + } + else + { + retval = static_cast(__half2int_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__half2ull_rz(*this)); + } + else + { + retval = static_cast(__half2uint_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long long() const { return __half2ll_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long long() const { return __half2ull_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = 
__ll2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y = 
src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2::operator __half2_raw() const { + __half2_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); +, + ret.x = static_cast<__half_raw>(this->x).x; + ret.y = static_cast<__half_raw>(this->y).x; +) + return ret; +} +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ 
__half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( 
pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + unsigned int u; + unsigned int result; +#if defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)::std::memcpy(&x, &f, sizeof(f)); +#endif + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU); + } else if (u > 0x477fefffU) { // Overflows + remainder = 0x80000000U; + result = (sign | 0x7bffU); + } else if (u >= 0x38800000U) { // Normal numbers + remainder = u << 19U; + u -= 0x38000000U; + result = (sign | (u >> 13U)); + } else if (u < 0x33000001U) { // +0/-0 + remainder = u; + result = sign; + } else { // Denormal numbers + const unsigned int exponent = u >> 23U; + const unsigned int shift = 0x7eU - exponent; + unsigned int mantissa = (u & 0x7fffffU); + mantissa |= 0x800000U; + remainder = mantissa << (32U - shift); + result = (sign | (mantissa >> shift)); + result &= 0x0000FFFFU; + } + return static_cast(result); +} +#endif /* #if !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a) +{ +IF_DEVICE_OR_CUDACC( + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a)); + return val; +, + __half result; + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. 
+ // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + unsigned long long int absa; + unsigned long long int ua; + (void)memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + result = __float2half(static_cast(a)); + } + else + { + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { // Here if |a| >= 2^(-14) + // add 42 to exponent bits + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + unsigned long long int aShiftRoundBits; + (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = 
__float2half(static_cast(aShiftRound - shifter)); + } + + return result; +, + __half result; + /* + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. + // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + */ + unsigned long long int absa; + unsigned long long int ua; + (void)::std::memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + /* + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + */ + result = __float2half(static_cast(a)); + } + else + { + /* + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. 
+ */ + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { + /* + // Here if |a| >= 2^(-14) + // add 42 to exponent bits + */ + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { + /* + // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + */ + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)::std::memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + /* + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + */ + unsigned long long int aShiftRoundBits; + (void)::std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)::std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = __float2half(static_cast(aShiftRound - shifter)); + } + + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) 
&& ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low;\n" + " cvt.rn.f16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a)); +, + val = __half2(__float2half_rn(a), __float2half_rn(a)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) { + __half2 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n" + : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +, + asm("{.reg .f16 low,high;\n" + " cvt.rn.f16.f32 low, %1;\n" + " cvt.rn.f16.f32 high, %2;\n" + " mov.b32 %0, {low,high};}\n" : 
"=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +) + return val; +} + +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_float2_to_half2_rn(a,b); +, + val = __half2(__float2half_rn(a), __float2half_rn(b)); +) + return val; +} + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline float __internal_half2float(const unsigned short h) +{ + unsigned int sign = ((static_cast(h) >> 15U) & 1U); + unsigned int exponent = ((static_cast(h) >> 10U) & 0x1fU); + unsigned int mantissa = ((static_cast(h) & 0x3ffU) << 13U); + float f; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)::std::memcpy(&f, &u, sizeof(u)); +#endif + return f; +} +#endif /* !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a))); +, + val = __internal_half2float(static_cast<__half_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + 
val = __internal_half2float(static_cast<__half2_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + val = __internal_half2float(static_cast<__half2_raw>(a).y); +) + return val; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h) +{ + signed char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.s8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + const unsigned char u = static_cast(tmp); + i = static_cast(u); +, + const float f = __half2float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.u8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + i = static_cast(tmp); +, + const float f = __half2float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = 
static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h) +{ + short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h) +{ + unsigned short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h) +{ + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else 
if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h) +{ + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = min_val; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + 
const float f = __half2float(h); + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a) +{ + const __half2 val = __floats2half2_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a) +{ + float hi_float; + float lo_float; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a))); +, + lo_float = __internal_half2float(((__half2_raw)a).x); + hi_float = __internal_half2float(((__half2_raw)a).y); +) + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ int __half2int_rn(const __half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(const __half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(const __half h) +{ + int i; + asm("cvt.rpi.s32.f16 
%0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const 
short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u16 %0, %1;" : 
"=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 
{low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.x; + val.y = b.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.y; + val.y = b.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.x; +) + return ret; +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + const __half_raw araw = __half_raw(a); + if (araw.x == 0xFC00U) { + retval = -1; + } else if (araw.x == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.x; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.y; +) + return ret; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); +, + val.x = a; + val.y = b; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); +, + val.x = a; + val.y = a; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : 
"r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return static_cast(__HALF_TO_CUS(h)); +, + return static_cast(__half_raw(h).x); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __HALF_TO_CUS(h); +, + return __half_raw(h).x; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = static_cast(i); + return h; +, + __half_raw hr; + hr.x = static_cast(i); + return __half(hr); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = i; + return h; +, + __half_raw hr; + hr.x = i; + return __half(hr);) +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __internal_device_hmax(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +__CUDA_FP16_DECL__ __half __internal_device_hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, 
const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmax(a, b); +, + __half maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmin(a, b); +, + __half minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} + + +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max) +, + __half2 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min) +, + __half2 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name, var, delta, c, mask) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ 
__half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = 
__shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} 
+__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void 
__stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.eq) +, + __half2_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ne) +, + __half2_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.le) +, + __half2_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ge) +, + __half2_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.lt) +, + __half2_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gt) +, + __half2_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.equ) +, + __half2_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.neu) +, + __half2_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.leu) +, + __half2_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.geu) +, + __half2_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ltu) +, + __half2_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gtu) +, + __half2_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +#undef __COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half2 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* do */ {\ + unsigned val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_HALF2_MACRO_MASK + +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + const unsigned mask = __heq2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hne2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hle2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hge2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hlt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + const unsigned 
mask = __hgt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hequ2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hneu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hleu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgeu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hltu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgtu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(eq) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ne) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(le) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ge) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(lt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(equ) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + 
__COMPARISON_OP_HALF_MACRO(neu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(leu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(geu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ltu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gtu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add) +, + __half2 val; + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub) +, + __half2 val; + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul) +, + __half2 val; + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.sat) +, + __half2 val; + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.sat) +, + __half2 val; + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.sat) +, + __half2 val; + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.rn) +, + __half2 val; + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.rn) +, + __half2 val; + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.rn) +, + __half2 val; + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); + return val; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const 
__half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.sat) +, + return __hmin(__hmax(__hadd(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.sat) +, + return 
__hmin(__hmax(__hsub(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.sat) +, + return __hmin(__hmax(__hmul(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + abs = __habs(v); + if 
(__hlt(abs, den) && __hlt(__float2half(0.0f), abs)) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa / fb); +) +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#if defined(_NVHPC_CUDA) || defined(__CUDACC__) +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," 
__CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float __float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 
i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 
f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ + +__CUDA_FP16_DECL__ __half htanh(const __half a) { + float f = __half2float(a); + f = tanhf(f); + __half h = __float2half_rn(f); + return h; +} +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a) { + float2 f = __half22float2(a); + f.x = tanhf(f.x); + f.y = tanhf(f.y); + __half2 h = __float22half2_rn(f); + return h; +} + +__CUDA_FP16_DECL__ __half htanh_approx(const __half a) { + __half r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + __half_raw hr = (__half_raw)a; + asm("tanh.approx.f16 %0, %0;" : "+h"(hr.x)); + r = (__half)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a) { + __half2 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("tanh.approx.f16x2 %0, %1;" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + 
+__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " 
ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ 
__half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + 
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +#endif /* defined(_NVHPC_CUDA) || defined(__CUDACC__) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); +, + __half2_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + r = __half2(val); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +, + const __half_raw hr = static_cast<__half_raw>(a); + return ((hr.x & (unsigned short)0x7FFFU) > (unsigned short)0x7C00U); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + const float fa = __half2float(a); + return __float2half(-fa); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + __half_raw abs_a_raw = static_cast<__half_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned short)0x7C00U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__half>(abs_a_raw); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // 
(a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max.NaN) +, + __half maxval; + if (__hisnan(a) || __hisnan(b)) + { + maxval = CUDART_NAN_FP16; + } + else + { + maxval = __hmax(a, b); + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min.NaN) +, + __half minval; + if (__hisnan(a) || __hisnan(b)) + { + minval = CUDART_NAN_FP16; + } + else + { + minval = __hmin(a, b); + } + return minval; +) +} + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +, + return __hmax_nan(__hfma(a, b, c), CUDART_ZERO_FP16); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max.NaN) +, + __half2 result = __hmax2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min.NaN) +, + __half2 result = __hmin2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +, + __half2_raw hzero; + hzero.x = (unsigned short)0U; + hzero.y = (unsigned short)0U; + return __hmax2_nan(__hfma2(a, b, c), __half2(hzero)); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_60, + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __half2 new_val = __hadd2(val, *(__half2*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__half2*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + 
__half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +#endif /* !(defined __DOXYGEN_ONLY__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI +#undef __CUDA_FP16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ From e33c7cc6d57a751d93555cc5b1cf49362b8d0b31 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 00:33:22 -0700 Subject: [PATCH 39/56] update doc --- docs/source/reference/types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 40112210a..303822eaf 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -80,7 +80,7 @@ Data Movement and Casts Construction of a single instance of a ``bfloat16`` object: -.. function:: numba.cuda.bf16.bfloat16(b) +.. function:: numba.cuda.types.bfloat16(b) Constructs a ``bfloat16`` from existing device `scalar`. 
Supported scalar types: From bc3b27de97e60cab7a0abf4df793ff0875c3caa9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 11:39:21 -0700 Subject: [PATCH 40/56] remove bfloattype custom impl --- numba_cuda/numba/cuda/models.py | 41 +-------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index d6e28b82f..335f504fa 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -1,4 +1,3 @@ -import struct import functools from llvmlite import ir @@ -46,46 +45,8 @@ def __init__(self, dmm, fe_type): register_model(CUDADispatcher)(models.OpaqueModel) -def _as_bfloat(value): - # Step 1: Reinterpret the input as u32 bits - u = struct.unpack("I", struct.pack("f", value))[0] - - # Step 2: Truncate (or round, we choose truncate) last 16 bits - trunc = u >> 16 - - # Step 3: Unpack them back to Python floats - f = struct.unpack("f", struct.pack("I", trunc))[0] - - return f - - -class BfloatType(ir.types._BaseFloatType): - """Brain-float type""" - - null = "0.0" - intrinsic_name = "bfloat" - - def __str__(self): - return "bfloat" - - def format_constant(self, value): - return ir.types._format_double(_as_bfloat(value)) - - -BfloatType._create_instance() - - @register_model(Bfloat16) class _model___nv_bfloat16(PrimitiveModel): def __init__(self, dmm, fe_type): - from numba.cuda.api import get_current_device - - major, minor = get_current_device().compute_capability - - # Blackwell device leverage latest nvvm (llvm 20+ dialect) which has - # bfloat type - if major >= 10: - be_type = BfloatType() - else: - be_type = ir.IntType(16) + be_type = ir.IntType(16) super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) From edea3c3e7022e8f6b32a748b3638801b8feb3380 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:23:47 -0700 Subject: [PATCH 41/56] add print tests --- .../numba/cuda/tests/cudapy/test_print.py | 18 
++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 0dbb3139b..a723885e0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -99,6 +99,20 @@ def print_too_many(r): cuda.synchronize() """ +print_bfloat16_usecase = """\ +from numba import cuda + +@cuda.jit +def print_bfloat16(): + # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits. + # printing this should not give any rounding error. + a = cuda.types.bfloat16(0.9375) + print(a, a, a) + +print_bfloat16[1, 1]() +cuda.synchronize() +""" + class TestPrint(CUDATestCase): # Note that in these tests we generally strip the output to avoid dealing @@ -145,6 +159,10 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) + def test_bfloat16(self): + output, _ = self.run_code(print_bfloat16_usecase) + self.assertEqual(output.strip(), "0.937500 0.937500 0.937500") + @skip_on_cudasim("cudasim can print unlimited output") def test_too_many_args(self): # Tests that we emit the format string and warn when there are more From 0fa0174d42320b011f3fd7b3c563042d48f80c5d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:30:50 -0700 Subject: [PATCH 42/56] add documentation for bfloat16 type --- numba_cuda/numba/cuda/types.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 17a4184d1..1f7786c37 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -43,7 +43,24 @@ class CUDADispatcher(types.Dispatcher): class Bfloat16(types.Number): """ - A bfloat16 type. + A bfloat16 type. Has 8 exponent bits and 7 significand bits. 
+ + Conversion rules: + Floats: + from: + fp32, fp64: UNSAFE + fp16: UNSAFE (loses precision) + to: + fp32, fp64: PROMOTE (same exponent, more mantissa) + fp16: UNSAFE (loses range) + + Integers: + from: + int8: SAFE + other int: All UNSAFE (bf16 cannot represent all integers in range) + to: UNSAFE (loses precision, round to zeros) + + All other conversions are not allowed. """ def __init__(self): @@ -59,8 +76,8 @@ def can_convert_from(self, typingctx, other): elif isinstance(other, types.Integer): if other.bitwidth == 8: return Conversion.safe - - return Conversion.unsafe + else: + return Conversion.unsafe def can_convert_to(self, typingctx, other): if isinstance(other, types.Float): @@ -71,8 +88,6 @@ def can_convert_to(self, typingctx, other): elif isinstance(other, types.Integer): return Conversion.unsafe - return Conversion.unsafe - def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) @@ -83,7 +98,9 @@ def cast_python_value(self, value): return ml_dtypes.bfloat16(value) except ImportError: - raise NotImplementedError + raise NotImplementedError( + "Please install ml_dtypes to use bfloat16 on host." + ) bfloat16 = Bfloat16() From e09ffc6dff6a935c4cd9325f1d4eab4120921787 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:34:58 -0700 Subject: [PATCH 43/56] update ci script and pyproject toml to make ml_dtypes a test time dependency --- ci/test_conda.sh | 1 + ci/test_conda_ctypes_binding.sh | 1 + pyproject.toml | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 06c3c6e06..4aa989c81 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -30,6 +30,7 @@ rapids-mamba-retry create -n test \ pytest \ pytest-xdist \ cffi \ + ml_dtypes \ python=${RAPIDS_PY_VERSION} # Temporarily allow unbound variables for conda activation. 
diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index a274c021e..4365eb0b7 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -24,6 +24,7 @@ rapids-mamba-retry create -n test \ pytest \ pytest-xdist \ cffi \ + ml_dtypes \ python=${RAPIDS_PY_VERSION} # Temporarily allow unbound variables for conda activation. diff --git a/pyproject.toml b/pyproject.toml index f3add3728..0c3fa9479 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ test = [ "pytest", "pytest-xdist", "filecheck", + "ml_dtypes", ] test-cu12 = [ "numba-cuda[cu12]", From 14b9fd5cfb5d61d87efe4a3ac453099955bf9a33 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 10:36:18 -0700 Subject: [PATCH 44/56] add manual implementation of bf16->fp64, litint->bf16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index a2af16d04..055ae2e00 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1599,6 +1599,34 @@ def impl(context, builder, fromty, toty, value): _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj) +# C++ does not provide a conversion operator from bfloat16 to double, so we need to implement it manually. 
+def _from___nv_bfloat16_to_float64__lower(): + @lower_cast(_type___nv_bfloat16, float64) + def impl(context, builder, fromty, toty, value): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(value, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + # printf("%f") expects a double; promote to f64 to match vararg expectation + f64 = builder.fpext(f32, ir.DoubleType()) + return f64 + + +_from___nv_bfloat16_to_float64__lower() + + +def _literalint_to_bf16_lower(): + @lower_cast(types.IntegerLiteral, _type___nv_bfloat16) + def impl(context, builder, fromty, toty, value): + f32 = context.cast(builder, value, fromty, float32) + i32 = builder.bitcast(f32, ir.IntType(32)) + i16 = builder.trunc(i32, ir.IntType(16)) + return i16 + + +_literalint_to_bf16_lower() + + # Typing for __nv_bfloat162 class _type_class___nv_bfloat162(Type): def __init__(self): From b7b70c6b8e6548df7eaa8ab4dc9be197378b5312 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 15:30:50 -0700 Subject: [PATCH 45/56] Maintain original overload resolution for all native operations --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 73 +++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 055ae2e00..8fc977cb0 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,6 +25,14 @@ from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast from numba.core.typing import signature +from numba.core.typing.old_builtins import ( + BinOp, + BinOpTrueDiv, + UnaryNegate, + UnaryPositive, + UnorderedCmpOp, + OrderedCmpOp, +) from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry 
from numba.cuda import CUSource, declare_device @@ -1607,7 +1615,6 @@ def impl(context, builder, fromty, toty, value): bits32 = builder.zext(value, ir.IntType(32)) shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) f32 = builder.bitcast(shift, ir.FloatType()) - # printf("%f") expects a double; promote to f64 to match vararg expectation f64 = builder.fpext(f32, ir.DoubleType()) return f64 @@ -15962,8 +15969,8 @@ class _typing___half(ConcreteTemplate): @register_global(operator.add) -class _typing_operator_add(ConcreteTemplate): - cases = [ +class _typing_operator_add(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15974,8 +15981,8 @@ class _typing_operator_add(ConcreteTemplate): @register_global(operator.sub) -class _typing_operator_sub(ConcreteTemplate): - cases = [ +class _typing_operator_sub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15986,8 +15993,8 @@ class _typing_operator_sub(ConcreteTemplate): @register_global(operator.mul) -class _typing_operator_mul(ConcreteTemplate): - cases = [ +class _typing_operator_mul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15998,8 +16005,8 @@ class _typing_operator_mul(ConcreteTemplate): @register_global(operator.truediv) -class _typing_operator_truediv(ConcreteTemplate): - cases = [ +class _typing_operator_truediv(BinOpTrueDiv): + cases = BinOpTrueDiv.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16010,8 +16017,8 @@ class _typing_operator_truediv(ConcreteTemplate): @register_global(operator.iadd) -class _typing_operator_iadd(ConcreteTemplate): - cases = [ +class _typing_operator_iadd(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16022,8 +16029,8 @@ class _typing_operator_iadd(ConcreteTemplate): 
@register_global(operator.isub) -class _typing_operator_isub(ConcreteTemplate): - cases = [ +class _typing_operator_isub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16034,8 +16041,8 @@ class _typing_operator_isub(ConcreteTemplate): @register_global(operator.imul) -class _typing_operator_imul(ConcreteTemplate): - cases = [ +class _typing_operator_imul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16046,8 +16053,8 @@ class _typing_operator_imul(ConcreteTemplate): @register_global(operator.itruediv) -class _typing_operator_itruediv(ConcreteTemplate): - cases = [ +class _typing_operator_itruediv(BinOpTrueDiv): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16058,64 +16065,64 @@ class _typing_operator_itruediv(ConcreteTemplate): @register_global(operator.pos) -class _typing_operator_pos(ConcreteTemplate): - cases = [ +class _typing_operator_pos(UnaryPositive): + cases = UnaryPositive.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.neg) -class _typing_operator_neg(ConcreteTemplate): - cases = [ +class _typing_operator_neg(UnaryNegate): + cases = UnaryNegate.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.eq) -class _typing_operator_eq(ConcreteTemplate): - cases = [ +class _typing_operator_eq(UnorderedCmpOp): + cases = UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ne) -class _typing_operator_ne(ConcreteTemplate): - cases = [ +class _typing_operator_ne(UnorderedCmpOp): + cases = UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, 
_type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.gt) -class _typing_operator_gt(ConcreteTemplate): - cases = [ +class _typing_operator_gt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.lt) -class _typing_operator_lt(ConcreteTemplate): - cases = [ +class _typing_operator_lt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ge) -class _typing_operator_ge(ConcreteTemplate): - cases = [ +class _typing_operator_ge(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.le) -class _typing_operator_le(ConcreteTemplate): - cases = [ +class _typing_operator_le(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] From 220287aed7eed13eb089c1bb10c80d7b66f484ea Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 15:32:03 -0700 Subject: [PATCH 46/56] remove operator function exposure (a numbast bug) --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 34 -------------------- 1 file changed, 34 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 8fc977cb0..3b63e14ff 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -16385,40 +16385,6 @@ class _typing_operator_le(OrderedCmpOp): "h2cos", "h2sin", "atomicAdd", - "atomicAdd", - "operator+", - "operator-", - "operator*", - "operator/", - "operator+=", - "operator-=", - 
"operator*=", - "operator/=", - "operator+", - "operator-", - "operator==", - "operator!=", - "operator>", - "operator<", - "operator>=", - "operator<=", - "operator+", - "operator-", - "operator*", - "operator/", - "operator+=", - "operator-=", - "operator*=", - "operator/=", - "operator+", - "operator-", - "operator==", - "operator!=", - "operator>", - "operator<", - "operator>=", - "operator<=", - "__half", ] From 0f6683e3c689217a147f041ce1d1f2cf3ba18c3d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 22:17:40 -0700 Subject: [PATCH 47/56] remove ml_dtypes dependency in core --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 10 ---------- numba_cuda/numba/cuda/types.py | 10 ---------- 2 files changed, 20 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 8f0d4569b..0d6a5935e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,5 +1,3 @@ -import unittest -from importlib.util import find_spec import numpy as np from ml_dtypes import bfloat16 as mldtypes_bf16 @@ -572,14 +570,6 @@ def kernel(out): _bf16_ulp_distance(raw[4:], f8_expected), 2 ) - @unittest.skipIf( - find_spec("ml_dtypes") is None, - "ml_dtypes is required to use bfloat16 on host", - ) - def test_use_bfloat16_on_host(self): - x = bfloat16(3.0) - self.assertEqual(x, 3.0) - def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray: """ diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 1f7786c37..f1b23a836 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -92,15 +92,5 @@ def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) - def cast_python_value(self, value): - try: - import ml_dtypes # noqa: F401 - - return ml_dtypes.bfloat16(value) - except ImportError: - raise 
NotImplementedError( - "Please install ml_dtypes to use bfloat16 on host." - ) - bfloat16 = Bfloat16() From c309442f40a09c9562a10d1363bdfa2013b992f9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 22:56:00 -0700 Subject: [PATCH 48/56] use builtin not old_builtin --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 3b63e14ff..1edc43555 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,7 +25,7 @@ from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast from numba.core.typing import signature -from numba.core.typing.old_builtins import ( +from numba.core.typing.builtins import ( BinOp, BinOpTrueDiv, UnaryNegate, From 6ed0e815ca441e6b3bac1a9b843ec4ad55b1b907 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:10:01 -0700 Subject: [PATCH 49/56] add ml_dtypes to simulator ci --- ci/test_simulator.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index 4bdaf8bef..bb85a8733 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -13,6 +13,7 @@ DEPENDENCIES=( "pytest" "pytest-xdist" "cffi" + "ml_dtypes" "python=${RAPIDS_PY_VERSION}" "numba-cuda" ) From c0250cb45be80afb212c1eb8523376c2d65dfed8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:25:03 -0700 Subject: [PATCH 50/56] fix sub-sub section headers --- docs/source/reference/types.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index f06caf5f4..9cc4c2bf2 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -370,7 +370,7 @@ Special value predicates: `_. 
Precision Conversion and Data Movement -************************************* +************************************** The following conversion intrinsics convert between ``bfloat16`` and other scalar types. Rounding-mode suffixes: @@ -381,7 +381,7 @@ scalar types. Rounding-mode suffixes: - ``_ru``: round-up (towards +∞) Floating-point conversions -========================== +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: numba.cuda.bf16.float32_to_bfloat16(x) @@ -403,13 +403,13 @@ Floating-point conversions Convert a ``float32`` to ``bfloat16`` using the specified rounding mode. Integer conversions -=================== +^^^^^^^^^^^^^^^^^^^^ Representative APIs for each integer width are listed below. All have rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``. int16 (signed 16-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x) @@ -426,7 +426,7 @@ int16 (signed 16-bit) Convert a ``bfloat16`` to ``int16`` with the selected rounding mode. uint16 (unsigned 16-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x) @@ -443,7 +443,7 @@ uint16 (unsigned 16-bit) Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode. int32 (signed 32-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x) @@ -460,7 +460,7 @@ int32 (signed 32-bit) Convert a ``bfloat16`` to ``int32`` with the selected rounding mode. uint32 (unsigned 32-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x) @@ -477,7 +477,7 @@ uint32 (unsigned 32-bit) Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode. 
int64 (signed 64-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x) @@ -494,7 +494,7 @@ int64 (signed 64-bit) Convert a ``bfloat16`` to ``int64`` with the selected rounding mode. uint64 (unsigned 64-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x) @@ -511,7 +511,7 @@ uint64 (unsigned 64-bit) Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode. 8-bit conversions -================= +^^^^^^^^^^^^^^^^^^ .. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x) @@ -522,7 +522,7 @@ uint64 (unsigned 64-bit) Convert a ``bfloat16`` to ``uint8`` with round-towards-zero. Bit Reinterpret Casts -********************* +^^^^^^^^^^^^^^^^^^^^^ These APIs reinterpret bits without numeric conversion: From 13d7cb7838c5d72521aa499cba4ed7eee4022e74 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:44:16 -0700 Subject: [PATCH 51/56] skip simulator for roundtrip --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 4e9b3dd30..f8f47644b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -344,6 +344,8 @@ def kernel(out): self.assertAlmostEqual(out[3], 2.0, delta=1e-3) def test_bfloat16_as_bitcast(self): + self.skip_unsupported() + @cuda.jit def roundtrip_kernel(test_val, i2, u2): i2[0] = int16_as_bfloat16(bfloat16_as_int16(test_val)) From d41c67cf0eabe491c6fc1ea80f53bdb20c55e61d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 09:50:05 -0700 Subject: [PATCH 52/56] use numba typing templates --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index c7a8ed934..33beb2b5a 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -36,8 +36,8 @@ UnorderedCmpOp, OrderedCmpOp, ) -from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate -from numba.core.typing.templates import Registry as TypingRegistry +from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.cuda.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type From 7d77ada2314e6ac206cba1c2c07783ac6c9277ae Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 10:54:56 -0700 Subject: [PATCH 53/56] skip lto test without nvjitlink --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py index 31d1d13e4..7d4343e35 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py @@ -5,6 +5,7 @@ from numba.cuda.testing import unittest, CUDATestCase import numpy as np import operator +from numba.cuda.testing import skip_if_nvjitlink_missing from numba import ( config, @@ -293,6 +294,7 @@ def kernel(arr): np.testing.assert_allclose(arr, [3], atol=1e-2) + @skip_if_nvjitlink_missing("LTO is not supported without nvjitlink.") def test_bf16_intrinsics_used_in_lto(self): self.skip_unsupported() From 212f4f0139f55279240eabb837153a210a7400f8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 10:58:13 -0700 Subject: [PATCH 54/56] skip cuda sim for bfloat16 tests --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 191 +++++++++--------- 
1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index f8f47644b..95e5fe140 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -14,100 +14,105 @@ uint16, uint32, uint64, + config, ) -from numba.cuda.bf16 import ( - bfloat16, - habs, - hadd, - hsub, - hmul, - hadd_rn, - hsub_rn, - hmul_rn, - hdiv, - hadd_sat, - hsub_sat, - hmul_sat, - hfma, - hfma_sat, - hneg, - hfma_relu, - # Comparison intrinsics - heq, - hne, - hge, - hgt, - hle, - hlt, - hmax, - hmin, - hmax_nan, - hmin_nan, - hisnan, - hisinf, - # Conversion intrinsics (NumPy-style names) - bfloat16_to_int8_rz, - bfloat16_to_uint8_rz, - int16_to_bfloat16_rn, - int16_to_bfloat16_rz, - int16_to_bfloat16_rd, - int16_to_bfloat16_ru, - bfloat16_to_int16_rn, - bfloat16_to_int16_rz, - bfloat16_to_int16_rd, - bfloat16_to_int16_ru, - uint16_to_bfloat16_rn, - uint16_to_bfloat16_rz, - uint16_to_bfloat16_rd, - uint16_to_bfloat16_ru, - bfloat16_to_uint16_rn, - bfloat16_to_uint16_rz, - bfloat16_to_uint16_rd, - bfloat16_to_uint16_ru, - int32_to_bfloat16_rn, - int32_to_bfloat16_rz, - int32_to_bfloat16_rd, - int32_to_bfloat16_ru, - bfloat16_to_int32_rn, - bfloat16_to_int32_rz, - bfloat16_to_int32_rd, - bfloat16_to_int32_ru, - uint32_to_bfloat16_rn, - uint32_to_bfloat16_rz, - uint32_to_bfloat16_rd, - uint32_to_bfloat16_ru, - bfloat16_to_uint32_rn, - bfloat16_to_uint32_rz, - bfloat16_to_uint32_rd, - bfloat16_to_uint32_ru, - bfloat16_to_int64_rn, - bfloat16_to_int64_rz, - bfloat16_to_int64_rd, - bfloat16_to_int64_ru, - int64_to_bfloat16_rn, - int64_to_bfloat16_rz, - int64_to_bfloat16_rd, - int64_to_bfloat16_ru, - bfloat16_to_uint64_rn, - bfloat16_to_uint64_rz, - bfloat16_to_uint64_rd, - bfloat16_to_uint64_ru, - uint64_to_bfloat16_rn, - uint64_to_bfloat16_rz, - uint64_to_bfloat16_rd, - uint64_to_bfloat16_ru, - bfloat16_as_int16, - 
int16_as_bfloat16, - bfloat16_as_uint16, - uint16_as_bfloat16, - bfloat16_to_float32, - float32_to_bfloat16, - float64_to_bfloat16, - float32_to_bfloat16_rn, - float32_to_bfloat16_rz, - float32_to_bfloat16_rd, - float32_to_bfloat16_ru, -) + + +if not config.ENABLE_CUDASIM: + from numba.cuda.bf16 import ( + bfloat16, + habs, + hadd, + hsub, + hmul, + hadd_rn, + hsub_rn, + hmul_rn, + hdiv, + hadd_sat, + hsub_sat, + hmul_sat, + hfma, + hfma_sat, + hneg, + hfma_relu, + # Comparison intrinsics + heq, + hne, + hge, + hgt, + hle, + hlt, + hmax, + hmin, + hmax_nan, + hmin_nan, + hisnan, + hisinf, + # Conversion intrinsics (NumPy-style names) + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, + int16_to_bfloat16_rn, + int16_to_bfloat16_rz, + int16_to_bfloat16_rd, + int16_to_bfloat16_ru, + bfloat16_to_int16_rn, + bfloat16_to_int16_rz, + bfloat16_to_int16_rd, + bfloat16_to_int16_ru, + uint16_to_bfloat16_rn, + uint16_to_bfloat16_rz, + uint16_to_bfloat16_rd, + uint16_to_bfloat16_ru, + bfloat16_to_uint16_rn, + bfloat16_to_uint16_rz, + bfloat16_to_uint16_rd, + bfloat16_to_uint16_ru, + int32_to_bfloat16_rn, + int32_to_bfloat16_rz, + int32_to_bfloat16_rd, + int32_to_bfloat16_ru, + bfloat16_to_int32_rn, + bfloat16_to_int32_rz, + bfloat16_to_int32_rd, + bfloat16_to_int32_ru, + uint32_to_bfloat16_rn, + uint32_to_bfloat16_rz, + uint32_to_bfloat16_rd, + uint32_to_bfloat16_ru, + bfloat16_to_uint32_rn, + bfloat16_to_uint32_rz, + bfloat16_to_uint32_rd, + bfloat16_to_uint32_ru, + bfloat16_to_int64_rn, + bfloat16_to_int64_rz, + bfloat16_to_int64_rd, + bfloat16_to_int64_ru, + int64_to_bfloat16_rn, + int64_to_bfloat16_rz, + int64_to_bfloat16_rd, + int64_to_bfloat16_ru, + bfloat16_to_uint64_rn, + bfloat16_to_uint64_rz, + bfloat16_to_uint64_rd, + bfloat16_to_uint64_ru, + uint64_to_bfloat16_rn, + uint64_to_bfloat16_rz, + uint64_to_bfloat16_rd, + uint64_to_bfloat16_ru, + bfloat16_as_int16, + int16_as_bfloat16, + bfloat16_as_uint16, + uint16_as_bfloat16, + bfloat16_to_float32, + float32_to_bfloat16, + 
float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, + ) + from numba.cuda.testing import CUDATestCase import math From ac576bed3602129be944607d20eafee17792df78 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 13:04:51 -0700 Subject: [PATCH 55/56] update simulator tests --- numba_cuda/numba/cuda/tests/cudapy/test_print.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 15e8b7ebf..f6df2c1e3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -103,7 +103,11 @@ def print_too_many(r): """ print_bfloat16_usecase = """\ -from numba import cuda +from numba import cuda, config + +if config.ENABLE_CUDASIM: + print("bfloat16 on host is not yet supported.") + exit(0) @cuda.jit def print_bfloat16(): From f3946db1dc4b7df4a3490209c7dd92f692ad69e2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 22:19:46 -0700 Subject: [PATCH 56/56] skip simulator test on host --- numba_cuda/numba/cuda/tests/cudapy/test_print.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index f6df2c1e3..ff27fd169 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -105,10 +105,6 @@ def print_too_many(r): print_bfloat16_usecase = """\ from numba import cuda, config -if config.ENABLE_CUDASIM: - print("bfloat16 on host is not yet supported.") - exit(0) - @cuda.jit def print_bfloat16(): # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits. 
@@ -166,6 +162,7 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) + @skip_on_cudasim("bfloat16 on host is not yet supported.") def test_bfloat16(self): output, _ = self.run_code(print_bfloat16_usecase) self.assertEqual(output.strip(), "0.937500 0.937500 0.937500")