diff --git a/ci/test_conda.sh b/ci/test_conda.sh
index 717ba5dc2..ed2d57cef 100755
--- a/ci/test_conda.sh
+++ b/ci/test_conda.sh
@@ -36,6 +36,7 @@ DEPENDENCIES=(
     "pytest"
     "pytest-xdist"
     "cffi"
+    "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
 )
 # Constrain oldest supported dependencies for testing
diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh
index a7058619c..844b35b40 100755
--- a/ci/test_conda_ctypes_binding.sh
+++ b/ci/test_conda_ctypes_binding.sh
@@ -26,6 +26,7 @@ DEPENDENCIES=(
     "pytest"
     "pytest-xdist"
     "cffi"
+    "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
     "numba-cuda"
 )
diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh
index 4bdaf8bef..bb85a8733 100755
--- a/ci/test_simulator.sh
+++ b/ci/test_simulator.sh
@@ -13,6 +13,7 @@ DEPENDENCIES=(
     "pytest"
     "pytest-xdist"
     "cffi"
+    "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
     "numba-cuda"
 )
diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml
index f08cdbd77..2045b0376 100644
--- a/configs/cuda_bf16.yml
+++ b/configs/cuda_bf16.yml
@@ -1,10 +1,12 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 Name: Numba Bfloat16
-Version: 0.0.1
-Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h
+Version: 0.0.2
+GPU Arch:
+    - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16
+Entry Point: ./numba_cuda/numba/cuda/include/13/cuda_bf16.h
 File List:
-    - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h
+    - ./numba_cuda/numba/cuda/include/13/cuda_bf16.h
 Exclude: {}
 Types:
     __nv_bfloat16_raw: Number
@@ -21,6 +23,4 @@ Data Models:
     __nv_bfloat162: StructModel
     nv_bfloat162: StructModel
 Shim Include Override: "\"cuda_bf16.h\""
-Additional Import:
-    - os
-Require Pynvjitlink: False
+Use Separate Registry: True
diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst
index 4591a8905..9cc4c2bf2 100644
--- a/docs/source/reference/types.rst
+++ b/docs/source/reference/types.rst
@@ -84,7 +84,7 @@ Data Movement and Casts
 
 Construction of a single instance of a ``bfloat16`` object:
 
-.. function:: numba.cuda.bf16.bfloat16(b)
+.. function:: numba.cuda.types.bfloat16(b)
 
     Constructs a ``bfloat16`` from existing device `scalar`. Supported scalar
     types:
@@ -96,6 +96,7 @@ Construction of a single instance of a ``bfloat16`` object:
     - ``int32``
     - ``uint64``
     - ``uint32``
+    - ``float16``
 
 Conversely, ``bfloat16`` data can be cast back to existing native data type via
 ``dtype(b)``, where ``dtype`` is one of the data types above (except float16),
@@ -104,7 +105,7 @@ and ``b`` is a bfloat16 object.
 Arithmetic
 **********
 
-Supported arithmetic operations on ``bfloat`16`` operands are:
+Supported arithmetic operations on ``bfloat16`` operands are:
 
 - Arithmetic (``+``, ``-``, ``*``, ``/``)
 - Arithmetic assignment operators (``+=``, ``-=``, ``*=``, ``/=``)
@@ -144,11 +145,11 @@ on ``bfloat16`` are provided:
     mode.
 
 .. function:: numba.cuda.bf16.hlog2(b)
-    Calculates bfloat16 decimal logarithm of input ``b`` in round-to-nearest-even
-    mode.
+    Calculates bfloat16 binary logarithm (base-2) of input ``b`` in
+    round-to-nearest-even mode.
 
 .. function:: numba.cuda.bf16.hlog10(b)
-    Calculates bfloat16 natural exponential function of input ``b`` in
+    Calculates bfloat16 common logarithm (base-10) of input ``b`` in
     round-to-nearest-even mode.
 
 .. function:: numba.cuda.bf16.hcos(b)
@@ -191,3 +192,352 @@ on ``bfloat16`` are provided:
 .. function:: numba.cuda.bf16.hexp10(b)
     Calculates bfloat16 decimal exponential function of input ``b`` in
     round-to-nearest-even mode.
+
+
+Arithmetic Intrinsics
+*********************
+
+The following low-level arithmetic intrinsics are available under
+``numba.cuda.bf16`` and map to CUDA bfloat16 arithmetic functions. Unless
+otherwise noted, operations are performed in round-to-nearest-even mode.
+
+.. function:: numba.cuda.bf16.habs(a)
+
+    Calculates the absolute value of input ``a`` (bfloat16) and returns the result.
+
+.. function:: numba.cuda.bf16.hneg(a)
+
+    Negates input ``a`` (bfloat16) and returns the result.
+
+.. function:: numba.cuda.bf16.hadd(a, b)
+
+    Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode.
+
+.. function:: numba.cuda.bf16.hadd_rn(a, b)
+
+    Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. Prevents
+    contraction of separate operations into a fused-multiply-add.
+
+.. function:: numba.cuda.bf16.hadd_sat(a, b)
+
+    Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with
+    saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``.
+
+.. function:: numba.cuda.bf16.hsub(a, b)
+
+    Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode.
+
+.. function:: numba.cuda.bf16.hsub_rn(a, b)
+
+    Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode.
+    Prevents contraction of separate operations into a fused-multiply-add.
+
+.. function:: numba.cuda.bf16.hsub_sat(a, b)
+
+    Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode, with
+    saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``.
+
+.. function:: numba.cuda.bf16.hmul(a, b)
+
+    Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode.
+
+.. function:: numba.cuda.bf16.hmul_rn(a, b)
+
+    Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode.
+    Prevents contraction of separate operations into a fused-multiply-add.
+
+.. function:: numba.cuda.bf16.hmul_sat(a, b)
+
+    Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with
+    saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``.
+
+.. function:: numba.cuda.bf16.hdiv(a, b)
+
+    Divides ``a`` by ``b`` (bfloat16) in round-to-nearest-even mode.
+
+.. function:: numba.cuda.bf16.hfma(a, b, c)
+
+    Computes a fused multiply-add of ``a`` and ``b`` plus ``c`` (bfloat16) in
+    round-to-nearest-even mode; i.e. returns ``a * b + c``.
+
+.. function:: numba.cuda.bf16.hfma_sat(a, b, c)
+
+    Fused multiply-add in round-to-nearest-even mode with saturation to the
+    range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``.
+
+.. function:: numba.cuda.bf16.hfma_relu(a, b, c)
+
+    Fused multiply-add in round-to-nearest-even mode with ReLU saturation;
+    i.e. returns ``max(0, a * b + c)``.
+
+Comparison Intrinsics
+*********************
+
+Device-level comparison intrinsics operating on ``bfloat16`` values are
+available under ``numba.cuda.bf16``. Unless stated otherwise, the ordered
+comparisons return ``False`` if either input is NaN, following IEEE semantics.
+
+.. function:: numba.cuda.bf16.heq(a, b)
+
+    Ordered equality. Returns ``True`` iff ``a == b``. NaN inputs yield ``False``.
+
+.. function:: numba.cuda.bf16.hne(a, b)
+
+    Ordered inequality. Returns ``True`` iff ``a != b`` and neither input is NaN.
+    NaN inputs yield ``False``.
+
+.. function:: numba.cuda.bf16.hge(a, b)
+
+    Ordered greater-or-equal. NaN inputs yield ``False``.
+
+.. function:: numba.cuda.bf16.hgt(a, b)
+
+    Ordered greater-than. NaN inputs yield ``False``.
+
+.. function:: numba.cuda.bf16.hle(a, b)
+
+    Ordered less-or-equal. NaN inputs yield ``False``.
+
+.. function:: numba.cuda.bf16.hlt(a, b)
+
+    Ordered less-than. NaN inputs yield ``False``.
+
+The unordered comparison variants return ``True`` when either input is NaN:
+
+.. function:: numba.cuda.bf16.hequ(a, b)
+
+    Unordered equality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a == b``.
+
+.. function:: numba.cuda.bf16.hneu(a, b)
+
+    Unordered inequality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a != b``.
+
+.. function:: numba.cuda.bf16.hgeu(a, b)
+
+    Unordered greater-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a >= b``.
+
+.. function:: numba.cuda.bf16.hgtu(a, b)
+
+    Unordered greater-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a > b``.
+
+.. function:: numba.cuda.bf16.hleu(a, b)
+
+    Unordered less-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a <= b``.
+
+.. function:: numba.cuda.bf16.hltu(a, b)
+
+    Unordered less-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a < b``.
+
+Min/Max operations follow CUDA semantics for zeros and NaNs:
+
+.. function:: numba.cuda.bf16.hmax(a, b)
+
+    Returns ``max(a, b)``. If exactly one input is NaN, the other input is
+    returned; if both are NaN, the canonical NaN is returned. When both inputs
+    are zero, ``+0.0`` is treated as greater than ``-0.0``.
+
+.. function:: numba.cuda.bf16.hmin(a, b)
+
+    Returns ``min(a, b)``. If exactly one input is NaN, the other input is
+    returned; if both are NaN, the canonical NaN is returned. When both inputs
+    are zero, ``+0.0`` is treated as greater than ``-0.0``.
+
+.. function:: numba.cuda.bf16.hmax_nan(a, b)
+
+    Returns ``max(a, b)`` where NaNs pass through: if either input is NaN,
+    the canonical NaN is returned.
+
+.. function:: numba.cuda.bf16.hmin_nan(a, b)
+
+    Returns ``min(a, b)`` where NaNs pass through: if either input is NaN,
+    the canonical NaN is returned.
+
+Special value predicates:
+
+.. function:: numba.cuda.bf16.hisnan(a)
+
+    Returns ``True`` if ``a`` is a NaN, ``False`` otherwise.
+
+.. function:: numba.cuda.bf16.hisinf(a)
+
+    Returns a nonzero integer if ``a`` is infinite, otherwise ``0``.
+
+.. note::
+
+    Python comparison operators on ``bfloat16`` values in device code map to
+    the ordered comparisons above. For more details on the CUDA bfloat16
+    comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions
+    <https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH____BFLOAT16__COMPARISON.html#group__cuda__math____bfloat16__comparison>`_.
+
+Precision Conversion and Data Movement
+**************************************
+
+The following conversion intrinsics convert between ``bfloat16`` and other
+scalar types. Rounding-mode suffixes:
+
+- ``_rn``: round-to-nearest-even
+- ``_rz``: round-towards-zero
+- ``_rd``: round-down (towards −∞)
+- ``_ru``: round-up (towards +∞)
+
+Floating-point conversions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. function:: numba.cuda.bf16.float32_to_bfloat16(x)
+
+    Convert a ``float32`` to ``bfloat16`` (default rounding is round-to-nearest-even).
+
+.. function:: numba.cuda.bf16.float64_to_bfloat16(x)
+
+    Convert a ``float64`` to ``bfloat16`` (default rounding is round-to-nearest-even).
+
+.. function:: numba.cuda.bf16.bfloat16_to_float32(x)
+
+    Convert a ``bfloat16`` to ``float32``.
+
+.. function:: numba.cuda.bf16.float32_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.float32_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.float32_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.float32_to_bfloat16_ru(x)
+
+    Convert a ``float32`` to ``bfloat16`` using the specified rounding mode.
+
+Integer conversions
+^^^^^^^^^^^^^^^^^^^^
+
+Representative APIs for each integer width are listed below. All have
+rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``.
+
+int16 (signed 16-bit)
+"""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.int16_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.int16_to_bfloat16_ru(x)
+
+    Convert an ``int16`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_int16_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int16_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int16_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int16_ru(x)
+
+    Convert a ``bfloat16`` to ``int16`` with the selected rounding mode.
+
+uint16 (unsigned 16-bit)
+"""""""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.uint16_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.uint16_to_bfloat16_ru(x)
+
+    Convert a ``uint16`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_uint16_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint16_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint16_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint16_ru(x)
+
+    Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode.
+
+int32 (signed 32-bit)
+"""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.int32_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.int32_to_bfloat16_ru(x)
+
+    Convert an ``int32`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_int32_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int32_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int32_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int32_ru(x)
+
+    Convert a ``bfloat16`` to ``int32`` with the selected rounding mode.
+
+uint32 (unsigned 32-bit)
+"""""""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.uint32_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.uint32_to_bfloat16_ru(x)
+
+    Convert a ``uint32`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_uint32_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint32_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint32_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint32_ru(x)
+
+    Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode.
+
+int64 (signed 64-bit)
+"""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.int64_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.int64_to_bfloat16_ru(x)
+
+    Convert an ``int64`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_int64_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int64_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int64_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_int64_ru(x)
+
+    Convert a ``bfloat16`` to ``int64`` with the selected rounding mode.
+
+uint64 (unsigned 64-bit)
+"""""""""""""""""""""""""
+
+.. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x)
+.. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x)
+.. function:: numba.cuda.bf16.uint64_to_bfloat16_rd(x)
+.. function:: numba.cuda.bf16.uint64_to_bfloat16_ru(x)
+
+    Convert a ``uint64`` to ``bfloat16`` with the selected rounding mode.
+
+.. function:: numba.cuda.bf16.bfloat16_to_uint64_rn(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint64_rz(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint64_rd(x)
+.. function:: numba.cuda.bf16.bfloat16_to_uint64_ru(x)
+
+    Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode.
+
+8-bit conversions
+^^^^^^^^^^^^^^^^^^
+
+.. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x)
+
+    Convert a ``bfloat16`` to ``int8`` with round-towards-zero.
+
+.. function:: numba.cuda.bf16.bfloat16_to_uint8_rz(x)
+
+    Convert a ``bfloat16`` to ``uint8`` with round-towards-zero.
+
+Bit Reinterpret Casts
+^^^^^^^^^^^^^^^^^^^^^
+
+These APIs reinterpret bits without numeric conversion:
+
+.. function:: numba.cuda.bf16.bfloat16_as_int16(x)
+
+    Reinterpret the bits of ``bfloat16`` as an ``int16``.
+
+.. function:: numba.cuda.bf16.bfloat16_as_uint16(x)
+
+    Reinterpret the bits of ``bfloat16`` as a ``uint16``.
+
+.. function:: numba.cuda.bf16.int16_as_bfloat16(x)
+
+    Reinterpret the bits of an ``int16`` as a ``bfloat16``.
+
+.. function:: numba.cuda.bf16.uint16_as_bfloat16(x)
+
+    Reinterpret the bits of a ``uint16`` as a ``bfloat16``.
diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py
index a1cabfdff..33beb2b5a 100644
--- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py
+++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py
@@ -3,19 +3,18 @@
 
 # Automatically generated by Numbast Static Binding Generator
 # Generator Information:
-# Ast_canopy version: 0.3.0
-# Numbast version: 0.3.0
-# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
-# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
-# Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
+# Ast_canopy version: 0.5.0
+# Numbast version: 0.5.0
+# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/
+# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True}
+# Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml
 # Cudatoolkit version: (12, 8)
-# Default CUDA_HOME path: /home/wangm/micromamba/envs/numbast
+# Default CUDA_HOME path: /home/wangm/miniforge3/envs/numbast
 
 
 # Imports:
 import io
 import operator
-import os
 
 import numba
 from llvmlite import ir
@@ -26,11 +25,21 @@
     make_attribute_wrapper,
     register_model,
 )
-from numba.cuda.typing import signature
+from numba.core.imputils import Registry as TargetRegistry
+from numba.core.imputils import lower_cast
+from numba.core.typing import signature
+from numba.core.typing.builtins import (
+    BinOp,
+    BinOpTrueDiv,
+    UnaryNegate,
+    UnaryPositive,
+    UnorderedCmpOp,
+    OrderedCmpOp,
+)
 from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate
+from numba.cuda.typing.templates import Registry as TypingRegistry
 from numba.cuda import CUSource, declare_device
-from numba.cuda.cudadecl import register, register_attr, register_global
-from numba.cuda.cudaimpl import lower
+from numba.cuda.vector_types import vector_types
 from numba.extending import as_numba_type
 from numba.types import (
     CPointer,
@@ -49,9 +58,22 @@
     uint16,
     uint32,
     uint64,
+    void,
 )
+from numba.cuda.types import bfloat16
+
+float32x2 = vector_types["float32x2"]
+__half = float16
 
-# Setups:
+
+typing_registry = TypingRegistry()
+register = typing_registry.register
+register_attr = typing_registry.register_attr
+register_global = typing_registry.register_global
+target_registry = TargetRegistry()
+lower = target_registry.lower
+lower_attr = target_registry.lower_getattr
+lower_constant = target_registry.lower_constant
 
 # Shim Stream:
 
@@ -79,83 +101,84 @@ def reset(self):
 shim_stream.write(shim_prefix)
 shim_obj = CUSource(shim_stream)
 
+
 # Enums:
 
 
 # Structs:
 
 
-# Typing for unnamed1401637
-class _type_class_unnamed1401637(Type):
+# Typing for unnamed1405307
+class _type_class_unnamed1405307(Type):
     def __init__(self):
-        super().__init__(name="unnamed1401637")
+        super().__init__(name="unnamed1405307")
         self.alignof_ = 2
         self.bitwidth = 2 * 8
 
 
-_type_unnamed1401637 = _type_class_unnamed1401637()
+_type_unnamed1405307 = _type_class_unnamed1405307()
 
 
 # Make Python API for struct
-unnamed1401637 = type("unnamed1401637", (), {"_nbtype": _type_unnamed1401637})
+unnamed1405307 = type("unnamed1405307", (), {"_nbtype": _type_unnamed1405307})
 
-as_numba_type.register(unnamed1401637, _type_unnamed1401637)
+as_numba_type.register(unnamed1405307, _type_unnamed1405307)
 
 
-@register_model(_type_class_unnamed1401637)
-class _model_unnamed1401637(StructModel):
+@register_model(_type_class_unnamed1405307)
+class _model_unnamed1405307(StructModel):
     def __init__(self, dmm, fe_type):
         members = [("x", uint16)]
         super().__init__(dmm, fe_type, members)
 
 
 @register_attr
-class _attr_typing_unnamed1401637(AttributeTemplate):
-    key = globals()["unnamed1401637"]
+class _attr_typing_unnamed1405307(AttributeTemplate):
+    key = globals()["unnamed1405307"]
 
     def resolve_x(self, obj):
         return uint16
 
 
-make_attribute_wrapper(_type_class_unnamed1401637, "x", "x")
+make_attribute_wrapper(_type_class_unnamed1405307, "x", "x")
 
 
 @register
-class _ctor_template_unnamed1401637(ConcreteTemplate):
-    key = globals()["unnamed1401637"]
+class _ctor_template_unnamed1405307(ConcreteTemplate):
+    key = globals()["unnamed1405307"]
     cases = []
 
 
-register_global(unnamed1401637, Function(_ctor_template_unnamed1401637))
+register_global(unnamed1405307, Function(_ctor_template_unnamed1405307))
 
 
-# Typing for unnamed1401746
-class _type_class_unnamed1401746(Type):
+# Typing for unnamed1405416
+class _type_class_unnamed1405416(Type):
     def __init__(self):
-        super().__init__(name="unnamed1401746")
+        super().__init__(name="unnamed1405416")
         self.alignof_ = 4
         self.bitwidth = 4 * 8
 
 
-_type_unnamed1401746 = _type_class_unnamed1401746()
+_type_unnamed1405416 = _type_class_unnamed1405416()
 
 
 # Make Python API for struct
-unnamed1401746 = type("unnamed1401746", (), {"_nbtype": _type_unnamed1401746})
+unnamed1405416 = type("unnamed1405416", (), {"_nbtype": _type_unnamed1405416})
 
-as_numba_type.register(unnamed1401746, _type_unnamed1401746)
+as_numba_type.register(unnamed1405416, _type_unnamed1405416)
 
 
-@register_model(_type_class_unnamed1401746)
-class _model_unnamed1401746(StructModel):
+@register_model(_type_class_unnamed1405416)
+class _model_unnamed1405416(StructModel):
     def __init__(self, dmm, fe_type):
         members = [("x", uint16), ("y", uint16)]
         super().__init__(dmm, fe_type, members)
 
 
 @register_attr
-class _attr_typing_unnamed1401746(AttributeTemplate):
-    key = globals()["unnamed1401746"]
+class _attr_typing_unnamed1405416(AttributeTemplate):
+    key = globals()["unnamed1405416"]
 
     def resolve_x(self, obj):
         return uint16
@@ -164,56 +187,26 @@ def resolve_y(self, obj):
         return uint16
 
 
-make_attribute_wrapper(_type_class_unnamed1401746, "x", "x")
-
-
-make_attribute_wrapper(_type_class_unnamed1401746, "y", "y")
-
-
-@register
-class _ctor_template_unnamed1401746(ConcreteTemplate):
-    key = globals()["unnamed1401746"]
-    cases = []
-
-
-register_global(unnamed1401746, Function(_ctor_template_unnamed1401746))
-
-
-# Typing for __nv_bfloat16
-class _type_class___nv_bfloat16(Number):
-    def __init__(self):
-        super().__init__(name="__nv_bfloat16")
-        self.alignof_ = 2
-        self.bitwidth = 2 * 8
-
-
-_type___nv_bfloat16 = _type_class___nv_bfloat16()
-
+make_attribute_wrapper(_type_class_unnamed1405416, "x", "x")
 
-# Make Python API for struct
-__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16})
 
-as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16)
+make_attribute_wrapper(_type_class_unnamed1405416, "y", "y")
 
 
-@register_model(_type_class___nv_bfloat16)
-class _model___nv_bfloat16(PrimitiveModel):
-    def __init__(self, dmm, fe_type):
-        be_type = ir.IntType(fe_type.bitwidth)
-        super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type)
+__nv_bfloat16 = _type___nv_bfloat16 = bfloat16
 
 
-def _lower___nv_bfloat16_void(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_1(int &ignore, __nv_bfloat16 *self ) {
+    _ZN13__nv_bfloat16C1Ev_nbst(int &ignore, __nv_bfloat16 *self ) {
         new (self) __nv_bfloat16();
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_1",
+        "_ZN13__nv_bfloat16C1Ev_nbst",
         int32(
             CPointer(_type___nv_bfloat16),
         ),
@@ -227,9 +220,7 @@ def __nv_bfloat16_device_caller(arg_0):
     )
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_1", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ev_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -253,31 +244,31 @@ def ctor_impl(context, builder, sig, args):
         )
 
 
-_lower___nv_bfloat16_void(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_2(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) {
+    _ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) {
         new (self) __nv_bfloat16(*hr);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_2",
-        int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1401637)),
+        "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst",
+        int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1405307)),
     )
 
     def __nv_bfloat16_device_caller(arg_0, arg_1):
         return _ctor_decl___nv_bfloat16(arg_0, arg_1)
 
-    @lower(__nv_bfloat16, _type_unnamed1401637)
+    @lower(__nv_bfloat16, _type_unnamed1405307)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_2", shim_raw_str
+            "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -294,7 +285,7 @@ def ctor_impl(context, builder, sig, args):
             signature(
                 int32,
                 CPointer(_type___nv_bfloat16),
-                CPointer(_type_unnamed1401637),
+                CPointer(_type_unnamed1405307),
             ),
             (selfptr, *argptrs),
         )
@@ -302,21 +293,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(_type_unnamed1405307, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
 
-_lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj)
 
+_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj)
 
-def _lower___nv_bfloat16_float16(shim_stream, shim_obj):
+
+def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_3(int &ignore, __nv_bfloat16 *self , __half* f) {
+    _ZN13__nv_bfloat16C1E6__half_nbst(int &ignore, __nv_bfloat16 *self , __half* f) {
         new (self) __nv_bfloat16(*f);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_3",
+        "_ZN13__nv_bfloat16C1E6__half_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(float16)),
     )
 
@@ -327,7 +327,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_3", shim_raw_str
+            "_ZN13__nv_bfloat16C1E6__half_nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -348,21 +348,32 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    # By default, Numbast does not generate this cast because the C++ conversion
+    # constructor is marked explicit. We enable it by hand here.
+    @lower_cast(float16, __nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(__nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_float16(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_float32(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_4(int &ignore, __nv_bfloat16 *self , float* f) {
+    _ZN13__nv_bfloat16C1Ef_nbst(int &ignore, __nv_bfloat16 *self , float* f) {
         new (self) __nv_bfloat16(*f);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_4",
+        "_ZN13__nv_bfloat16C1Ef_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(float32)),
     )
 
@@ -372,9 +383,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, float32)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_4", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ef_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -394,21 +403,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(float32, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
 
-_lower___nv_bfloat16_float32(shim_stream, shim_obj)
 
+_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj)
 
-def _lower___nv_bfloat16_float64(shim_stream, shim_obj):
+
+def _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_5(int &ignore, __nv_bfloat16 *self , double* f) {
+    _ZN13__nv_bfloat16C1Ed_nbst(int &ignore, __nv_bfloat16 *self , double* f) {
         new (self) __nv_bfloat16(*f);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_5",
+        "_ZN13__nv_bfloat16C1Ed_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(float64)),
     )
 
@@ -418,9 +436,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, float64)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_5", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ed_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -440,21 +456,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(float64, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_float64(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_int16(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_6(int &ignore, __nv_bfloat16 *self , short* val) {
+    _ZN13__nv_bfloat16C1Es_nbst(int &ignore, __nv_bfloat16 *self , short* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_6",
+        "_ZN13__nv_bfloat16C1Es_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(int16)),
     )
 
@@ -464,9 +489,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, int16)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_6", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Es_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -486,21 +509,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(int16, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_int16(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_uint16(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_7(int &ignore, __nv_bfloat16 *self , unsigned short* val) {
+    _ZN13__nv_bfloat16C1Et_nbst(int &ignore, __nv_bfloat16 *self , unsigned short* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_7",
+        "_ZN13__nv_bfloat16C1Et_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(uint16)),
     )
 
@@ -510,9 +542,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, uint16)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_7", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Et_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -532,21 +562,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(uint16, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_uint16(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_int32(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_8(int &ignore, __nv_bfloat16 *self , int* val) {
+    _ZN13__nv_bfloat16C1Ei_nbst(int &ignore, __nv_bfloat16 *self , int* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_8",
+        "_ZN13__nv_bfloat16C1Ei_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(int32)),
     )
 
@@ -556,9 +595,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, int32)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_8", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ei_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -578,21 +615,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(int32, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
 
-_lower___nv_bfloat16_int32(shim_stream, shim_obj)
 
+_lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj)
 
-def _lower___nv_bfloat16_uint32(shim_stream, shim_obj):
+
+def _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_9(int &ignore, __nv_bfloat16 *self , unsigned int* val) {
+    _ZN13__nv_bfloat16C1Ej_nbst(int &ignore, __nv_bfloat16 *self , unsigned int* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_9",
+        "_ZN13__nv_bfloat16C1Ej_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(uint32)),
     )
 
@@ -602,9 +648,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, uint32)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_9", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ej_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -624,21 +668,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(uint32, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_uint32(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_int64(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_10(int &ignore, __nv_bfloat16 *self , long* val) {
+    _ZN13__nv_bfloat16C1El_nbst(int &ignore, __nv_bfloat16 *self , long* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_10",
+        "_ZN13__nv_bfloat16C1El_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(int64)),
     )
 
@@ -648,9 +701,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, int64)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_10", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1El_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -670,21 +721,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(int64, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_int64(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_uint64(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_11(int &ignore, __nv_bfloat16 *self , unsigned long* val) {
+    _ZN13__nv_bfloat16C1Em_nbst(int &ignore, __nv_bfloat16 *self , unsigned long* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_11",
+        "_ZN13__nv_bfloat16C1Em_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(uint64)),
     )
 
@@ -694,9 +754,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, uint64)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_11", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Em_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -716,21 +774,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(uint64, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_uint64(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat16_int64(shim_stream, shim_obj):
+def _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_12(int &ignore, __nv_bfloat16 *self , long long* val) {
+    _ZN13__nv_bfloat16C1Ex_nbst(int &ignore, __nv_bfloat16 *self , long long* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_12",
+        "_ZN13__nv_bfloat16C1Ex_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(int64)),
     )
 
@@ -740,9 +807,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, int64)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_12", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ex_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -762,21 +827,30 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(int64, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
 
-_lower___nv_bfloat16_int64(shim_stream, shim_obj)
 
+_lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj)
 
-def _lower___nv_bfloat16_uint64(shim_stream, shim_obj):
+
+def _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16____nv_bfloat16_13(int &ignore, __nv_bfloat16 *self , unsigned long long* val) {
+    _ZN13__nv_bfloat16C1Ey_nbst(int &ignore, __nv_bfloat16 *self , unsigned long long* val) {
         new (self) __nv_bfloat16(*val);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16____nv_bfloat16_13",
+        "_ZN13__nv_bfloat16C1Ey_nbst",
         int32(CPointer(_type___nv_bfloat16), CPointer(uint64)),
     )
 
@@ -786,9 +860,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1):
     @lower(__nv_bfloat16, uint64)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat16____nv_bfloat16_13", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ey_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
         )
@@ -808,8 +880,17 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None)
         )
 
+    @lower_cast(uint64, _type___nv_bfloat16)
+    def conversion_impl(context, builder, fromty, toty, value):
+        return ctor_impl(
+            context,
+            builder,
+            signature(_type___nv_bfloat16, fromty),
+            [value],
+        )
+
 
-_lower___nv_bfloat16_uint64(shim_stream, shim_obj)
+_lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj)
 
 
 @register
@@ -819,7 +900,7 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate):
         signature(
             _type___nv_bfloat16,
         ),
-        signature(_type___nv_bfloat16, _type_unnamed1401637),
+        signature(_type___nv_bfloat16, _type_unnamed1405307),
         signature(_type___nv_bfloat16, float16),
         signature(_type___nv_bfloat16, float32),
         signature(_type___nv_bfloat16, float64),
@@ -837,18 +918,18 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate):
 register_global(__nv_bfloat16, Function(_ctor_template___nv_bfloat16))
 
 
-def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj):
+def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator___nv_bfloat16_raw_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) {
         retval = self->operator __nv_bfloat16_raw();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator___nv_bfloat16_raw_1",
-        _type_unnamed1401637(
+        "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1",
+        _type_unnamed1405307(
             CPointer(_type___nv_bfloat16),
         ),
     )
@@ -856,11 +937,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj):
     def _conversion_op_caller___nv_bfloat16(arg):
         return _op_decl___nv_bfloat16(arg)
 
-    @lower_cast(_type___nv_bfloat16, _type_unnamed1401637)
+    @lower_cast(_type___nv_bfloat16, _type_unnamed1405307)
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator___nv_bfloat16_raw_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1",
+            shim_raw_str,
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -873,28 +955,28 @@ def impl(context, builder, fromty, toty, value):
             builder,
             _conversion_op_caller___nv_bfloat16,
             signature(
-                _type_unnamed1401637,
+                _type_unnamed1405307,
                 CPointer(_type___nv_bfloat16),
             ),
             (ptr,),
         )
 
 
-_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj)
+_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj)
 
 
-def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj):
+def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator___nv_bfloat16_raw_2(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) {
         retval = self->operator __nv_bfloat16_raw();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator___nv_bfloat16_raw_2",
-        _type_unnamed1401637(
+        "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1",
+        _type_unnamed1405307(
             CPointer(_type___nv_bfloat16),
         ),
     )
@@ -902,11 +984,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj):
     def _conversion_op_caller___nv_bfloat16(arg):
         return _op_decl___nv_bfloat16(arg)
 
-    @lower_cast(_type___nv_bfloat16, _type_unnamed1401637)
+    @lower_cast(_type___nv_bfloat16, _type_unnamed1405307)
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator___nv_bfloat16_raw_2", shim_raw_str
+            "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1",
+            shim_raw_str,
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -919,27 +1002,27 @@ def impl(context, builder, fromty, toty, value):
             builder,
             _conversion_op_caller___nv_bfloat16,
             signature(
-                _type_unnamed1401637,
+                _type_unnamed1405307,
                 CPointer(_type___nv_bfloat16),
             ),
             (ptr,),
         )
 
 
-_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj)
+_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj)
 
 
 def _from___nv_bfloat16_to_float32_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_float_1(float &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1(float &retval, __nv_bfloat16 *self) {
         retval = self->operator float();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_float_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1",
         float32(
             CPointer(_type___nv_bfloat16),
         ),
@@ -952,7 +1035,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_float_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -978,14 +1061,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_signed_char_1(signed char &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1(signed char &retval, __nv_bfloat16 *self) {
         retval = self->operator signed char();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_signed_char_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1",
         int8(
             CPointer(_type___nv_bfloat16),
         ),
@@ -998,7 +1081,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_signed_char_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1024,14 +1107,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_uint8_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_unsigned_char_1(unsigned char &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1(unsigned char &retval, __nv_bfloat16 *self) {
         retval = self->operator unsigned char();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_unsigned_char_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1",
         uint8(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1044,7 +1127,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_unsigned_char_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1070,14 +1153,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_char_1(char &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1(char &retval, __nv_bfloat16 *self) {
         retval = self->operator char();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_char_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1",
         int8(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1090,7 +1173,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_char_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1116,14 +1199,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int16_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_short_1(short &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1(short &retval, __nv_bfloat16 *self) {
         retval = self->operator short();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_short_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1",
         int16(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1136,7 +1219,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_short_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1162,14 +1245,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_uint16_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_unsigned_short_1(unsigned short &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1(unsigned short &retval, __nv_bfloat16 *self) {
         retval = self->operator unsigned short();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_unsigned_short_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1",
         uint16(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1182,7 +1265,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_unsigned_short_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1208,14 +1291,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int32_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_int_1(int &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1(int &retval, __nv_bfloat16 *self) {
         retval = self->operator int();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_int_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1",
         int32(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1228,7 +1311,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_int_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1254,14 +1337,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_uint32_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_unsigned_int_1(unsigned int &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1(unsigned int &retval, __nv_bfloat16 *self) {
         retval = self->operator unsigned int();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_unsigned_int_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1",
         uint32(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1274,7 +1357,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_unsigned_int_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1300,14 +1383,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_long_1(long &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1(long &retval, __nv_bfloat16 *self) {
         retval = self->operator long();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_long_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1",
         int64(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1320,7 +1403,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_long_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1346,14 +1429,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_unsigned_long_1(unsigned long &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1(unsigned long &retval, __nv_bfloat16 *self) {
         retval = self->operator unsigned long();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_unsigned_long_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1",
         uint64(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1366,7 +1449,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_unsigned_long_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1392,14 +1475,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_long_long_1(long long &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1(long long &retval, __nv_bfloat16 *self) {
         retval = self->operator long long();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_long_long_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1",
         int64(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1412,7 +1495,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_long_long_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1438,14 +1521,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_unsigned_long_long_1(unsigned long long &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1(unsigned long long &retval, __nv_bfloat16 *self) {
         retval = self->operator unsigned long long();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_unsigned_long_long_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1",
         uint64(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1458,7 +1541,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_unsigned_long_long_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1484,14 +1567,14 @@ def impl(context, builder, fromty, toty, value):
 def _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat16_operator_bool_1(bool &retval, __nv_bfloat16 *self) {
+    ____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1(bool &retval, __nv_bfloat16 *self) {
         retval = self->operator bool();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat16 = declare_device(
-        "____nv_bfloat16_operator_bool_1",
+        "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1",
         bool_(
             CPointer(_type___nv_bfloat16),
         ),
@@ -1504,7 +1587,7 @@ def _conversion_op_caller___nv_bfloat16(arg):
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat16_operator_bool_1", shim_raw_str
+            "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", shim_raw_str
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat16), name="selfptr"
@@ -1527,6 +1610,33 @@ def impl(context, builder, fromty, toty, value):
 _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj)
 
 
+# C++ does not provide a conversion operator from bfloat16 to double, so we need to implement it manually.
+def _from___nv_bfloat16_to_float64__lower():  # defines the bfloat16 -> float64 cast lowering (decorated with lower_cast)
+    @lower_cast(_type___nv_bfloat16, float64)
+    def impl(context, builder, fromty, toty, value):  # emits the IR for the cast and returns the resulting double value
+        # Hand-rolled bfloat16 -> float32 -> double conversion performed on the raw bits (zero-extended).
+        bits32 = builder.zext(value, ir.IntType(32))  # zero-extend the incoming value (presumably the 16 raw bfloat16 bits) to 32 bits
+        shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16))  # move those bits into the upper half; the low 16 bits become zero
+        f32 = builder.bitcast(shift, ir.FloatType())  # reinterpret the 32-bit pattern as a float
+        f64 = builder.fpext(f32, ir.DoubleType())  # widen float -> double
+        return f64
+
+
+_from___nv_bfloat16_to_float64__lower()
+
+
+def _literalint_to_bf16_lower():  # defines the integer-literal -> bfloat16 cast lowering (decorated with lower_cast)
+    @lower_cast(types.IntegerLiteral, _type___nv_bfloat16)
+    def impl(context, builder, fromty, toty, value):  # returns the 16-bit bfloat16 pattern for the literal
+        f32 = context.cast(builder, value, fromty, float32)  # convert the literal to float32 first
+        i32 = builder.bitcast(f32, ir.IntType(32))  # raw float32 bit pattern
+        hi16 = builder.lshr(i32, ir.Constant(ir.IntType(32), 16))  # bfloat16 is the UPPER half of a float32; low mantissa bits are dropped (truncation)
+        return builder.trunc(hi16, ir.IntType(16))  # a bare trunc of i32 would keep the low mantissa bits (e.g. 1 -> 0x0000)
+
+
+_literalint_to_bf16_lower()
+
+
 # Typing for __nv_bfloat162
 class _type_class___nv_bfloat162(Type):
     def __init__(self):
@@ -1568,17 +1678,17 @@ def resolve_y(self, obj):
 make_attribute_wrapper(_type_class___nv_bfloat162, "y", "y")
 
 
-def _lower___nv_bfloat162_void(shim_stream, shim_obj):
+def _lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162____nv_bfloat162_1(int &ignore, __nv_bfloat162 *self ) {
+    _ZN14__nv_bfloat162C1Ev_nbst(int &ignore, __nv_bfloat162 *self ) {
         new (self) __nv_bfloat162();
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162____nv_bfloat162_1",
+        "_ZN14__nv_bfloat162C1Ev_nbst",
         int32(
             CPointer(_type___nv_bfloat162),
         ),
@@ -1592,9 +1702,7 @@ def __nv_bfloat162_device_caller(arg_0):
     )
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key(
-            "____nv_bfloat162____nv_bfloat162_1", shim_raw_str
-        )
+        shim_stream.write_with_key("_ZN14__nv_bfloat162C1Ev_nbst", shim_raw_str)
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
         )
@@ -1618,20 +1726,20 @@ def ctor_impl(context, builder, sig, args):
         )
 
 
-_lower___nv_bfloat162_void(shim_stream, shim_obj)
+_lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj):
+def _lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162____nv_bfloat162_2(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) {
+    _ZN14__nv_bfloat162C1EOS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) {
         new (self) __nv_bfloat162(*src);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162____nv_bfloat162_2",
+        "_ZN14__nv_bfloat162C1EOS__nbst",
         int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
     )
 
@@ -1642,7 +1750,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1):
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat162____nv_bfloat162_2", shim_raw_str
+            "_ZN14__nv_bfloat162C1EOS__nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
@@ -1668,22 +1776,20 @@ def ctor_impl(context, builder, sig, args):
         )
 
 
-_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj)
+_lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16(
-    shim_stream, shim_obj
-):
+def _lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162____nv_bfloat162_3(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) {
+    _ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) {
         new (self) __nv_bfloat162(*a, *b);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162____nv_bfloat162_3",
+        "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst",
         int32(
             CPointer(_type___nv_bfloat162),
             CPointer(_type___nv_bfloat16),
@@ -1698,7 +1804,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1, arg_2):
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat162____nv_bfloat162_3", shim_raw_str
+            "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
@@ -1725,22 +1831,20 @@ def ctor_impl(context, builder, sig, args):
         )
 
 
-_lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16(
-    shim_stream, shim_obj
-)
+_lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj):
+def _lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162____nv_bfloat162_4(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) {
+    _ZN14__nv_bfloat162C1ERKS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) {
         new (self) __nv_bfloat162(*src);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162____nv_bfloat162_4",
+        "_ZN14__nv_bfloat162C1ERKS__nbst",
         int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
     )
 
@@ -1751,7 +1855,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1):
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat162____nv_bfloat162_4", shim_raw_str
+            "_ZN14__nv_bfloat162C1ERKS__nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
@@ -1777,31 +1881,31 @@ def ctor_impl(context, builder, sig, args):
         )
 
 
-_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj)
+_lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj)
 
 
-def _lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj):
+def _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162____nv_bfloat162_5(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) {
+    _ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) {
         new (self) __nv_bfloat162(*h2r);
         return 0;
     }
         """
 
     _ctor_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162____nv_bfloat162_5",
-        int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1401746)),
+        "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst",
+        int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1405416)),
     )
 
     def __nv_bfloat162_device_caller(arg_0, arg_1):
         return _ctor_decl___nv_bfloat162(arg_0, arg_1)
 
-    @lower(__nv_bfloat162, _type_unnamed1401746)
+    @lower(__nv_bfloat162, _type_unnamed1405416)
     def ctor_impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat162____nv_bfloat162_5", shim_raw_str
+            "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", shim_raw_str
         )
         selfptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
@@ -1818,7 +1922,7 @@ def ctor_impl(context, builder, sig, args):
             signature(
                 int32,
                 CPointer(_type___nv_bfloat162),
-                CPointer(_type_unnamed1401746),
+                CPointer(_type_unnamed1405416),
             ),
             (selfptr, *argptrs),
         )
@@ -1826,8 +1930,17 @@ def ctor_impl(context, builder, sig, args):
             selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None)
         )
 
+    @lower_cast(_type_unnamed1405416, _type___nv_bfloat162)
+    def conversion_impl(context, builder, fromty, toty, value):  # cast lowering that reuses the constructor lowering above
+        return ctor_impl(  # delegate to ctor_impl instead of duplicating its body
+            context,
+            builder,
+            signature(_type___nv_bfloat162, fromty),  # one-argument constructor signature: (source type) -> __nv_bfloat162
+            [value],  # the value being cast is the sole constructor argument
+        )
+
 
-_lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj)
+_lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj)
 
 
 @register
@@ -1842,25 +1955,25 @@ class _ctor_template___nv_bfloat162(ConcreteTemplate):
             _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16
         ),
         signature(_type___nv_bfloat162, _type___nv_bfloat162),
-        signature(_type___nv_bfloat162, _type_unnamed1401746),
+        signature(_type___nv_bfloat162, _type_unnamed1405416),
     ]
 
 
 register_global(__nv_bfloat162, Function(_ctor_template___nv_bfloat162))
 
 
-def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj):
+def _from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    ____nv_bfloat162_operator___nv_bfloat162_raw_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) {
+    ____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) {
         retval = self->operator __nv_bfloat162_raw();
         return 0;
     }
         """
 
     _op_decl___nv_bfloat162 = declare_device(
-        "____nv_bfloat162_operator___nv_bfloat162_raw_1",
-        _type_unnamed1401746(
+        "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1",
+        _type_unnamed1405416(
             CPointer(_type___nv_bfloat162),
         ),
     )
@@ -1868,11 +1981,12 @@ def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj):
     def _conversion_op_caller___nv_bfloat162(arg):
         return _op_decl___nv_bfloat162(arg)
 
-    @lower_cast(_type___nv_bfloat162, _type_unnamed1401746)
+    @lower_cast(_type___nv_bfloat162, _type_unnamed1405416)
     def impl(context, builder, fromty, toty, value):
         context.active_code_library.add_linking_file(shim_obj)
         shim_stream.write_with_key(
-            "____nv_bfloat162_operator___nv_bfloat162_raw_1", shim_raw_str
+            "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1",
+            shim_raw_str,
         )
         ptr = builder.alloca(
             context.get_value_type(_type___nv_bfloat162), name="selfptr"
@@ -1885,1997 +1999,2083 @@ def impl(context, builder, fromty, toty, value):
             builder,
             _conversion_op_caller___nv_bfloat162,
             signature(
-                _type_unnamed1401746,
+                _type_unnamed1405416,
                 CPointer(_type___nv_bfloat162),
             ),
             (ptr,),
         )
 
 
-_from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj)
+_from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj)
 
 
 # Functions:
 
 
-def make_bfloat162():
+def __double2bfloat16():
     pass
 
 
-def _make_bfloat162_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    make_bfloat162_1(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) {
-        retval = make_bfloat162(*x, *y);
+    _ZL17__double2bfloat16d_nbst(__nv_bfloat16 &retval , double* a) {
+        retval = __double2bfloat16(*a);
         return 0;
     }
         """
 
-    make_bfloat162_1 = declare_device(
-        "make_bfloat162_1",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL17__double2bfloat16d_nbst = declare_device(
+        "_ZL17__double2bfloat16d_nbst", _type___nv_bfloat16(CPointer(float64))
     )
 
-    def make_bfloat162_1_caller(arg_0, arg_1):
-        return make_bfloat162_1(arg_0, arg_1)
+    def _ZL17__double2bfloat16d_nbst_caller(arg_0):
+        return _ZL17__double2bfloat16d_nbst(arg_0)
 
-    @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__double2bfloat16, float64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("make_bfloat162_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            make_bfloat162_1_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__double2bfloat16d_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float64)),
             ptrs,
         )
 
 
-_make_bfloat162_1_lower(shim_stream, shim_obj)
+_lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj)
 
 
-def htrunc():
+def __float2bfloat16():
     pass
 
 
-def _htrunc_1_lower(shim_stream, shim_obj):
+def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    htrunc_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = htrunc(*h);
+    _ZL16__float2bfloat16f_nbst(__nv_bfloat16 &retval , float* a) {
+        retval = __float2bfloat16(*a);
         return 0;
     }
         """
 
-    htrunc_1 = declare_device(
-        "htrunc_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL16__float2bfloat16f_nbst = declare_device(
+        "_ZL16__float2bfloat16f_nbst", _type___nv_bfloat16(CPointer(float32))
     )
 
-    def htrunc_1_caller(arg_0):
-        return htrunc_1(arg_0)
+    def _ZL16__float2bfloat16f_nbst_caller(arg_0):
+        return _ZL16__float2bfloat16f_nbst(arg_0)
 
-    @lower(htrunc, _type___nv_bfloat16)
+    @lower(__float2bfloat16, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("htrunc_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            htrunc_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL16__float2bfloat16f_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float32)),
             ptrs,
         )
 
 
-_htrunc_1_lower(shim_stream, shim_obj)
+_lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj)
 
 
-def hceil():
+def __float2bfloat16_rn():
     pass
 
 
-def _hceil_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hceil_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = hceil(*h);
+    _ZL19__float2bfloat16_rnf_nbst(__nv_bfloat16 &retval , float* a) {
+        retval = __float2bfloat16_rn(*a);
         return 0;
     }
         """
 
-    hceil_1 = declare_device(
-        "hceil_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL19__float2bfloat16_rnf_nbst = declare_device(
+        "_ZL19__float2bfloat16_rnf_nbst", _type___nv_bfloat16(CPointer(float32))
     )
 
-    def hceil_1_caller(arg_0):
-        return hceil_1(arg_0)
+    def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0):
+        return _ZL19__float2bfloat16_rnf_nbst(arg_0)
 
-    @lower(hceil, _type___nv_bfloat16)
+    @lower(__float2bfloat16_rn, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hceil_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__float2bfloat16_rnf_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hceil_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL19__float2bfloat16_rnf_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float32)),
             ptrs,
         )
 
 
-_hceil_1_lower(shim_stream, shim_obj)
+_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj)
 
 
-def hfloor():
+def __float2bfloat16_rz():
     pass
 
 
-def _hfloor_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hfloor_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = hfloor(*h);
+    _ZL19__float2bfloat16_rzf_nbst(__nv_bfloat16 &retval , float* a) {
+        retval = __float2bfloat16_rz(*a);
         return 0;
     }
         """
 
-    hfloor_1 = declare_device(
-        "hfloor_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL19__float2bfloat16_rzf_nbst = declare_device(
+        "_ZL19__float2bfloat16_rzf_nbst", _type___nv_bfloat16(CPointer(float32))
     )
 
-    def hfloor_1_caller(arg_0):
-        return hfloor_1(arg_0)
+    def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0):
+        return _ZL19__float2bfloat16_rzf_nbst(arg_0)
 
-    @lower(hfloor, _type___nv_bfloat16)
+    @lower(__float2bfloat16_rz, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hfloor_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__float2bfloat16_rzf_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hfloor_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL19__float2bfloat16_rzf_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float32)),
             ptrs,
         )
 
 
-_hfloor_1_lower(shim_stream, shim_obj)
+_lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj)
 
 
-def hrint():
+def __float2bfloat16_rd():
     pass
 
 
-def _hrint_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hrint_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = hrint(*h);
+    _ZL19__float2bfloat16_rdf_nbst(__nv_bfloat16 &retval , float* a) {
+        retval = __float2bfloat16_rd(*a);
         return 0;
     }
         """
 
-    hrint_1 = declare_device(
-        "hrint_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL19__float2bfloat16_rdf_nbst = declare_device(
+        "_ZL19__float2bfloat16_rdf_nbst", _type___nv_bfloat16(CPointer(float32))
     )
 
-    def hrint_1_caller(arg_0):
-        return hrint_1(arg_0)
+    def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0):
+        return _ZL19__float2bfloat16_rdf_nbst(arg_0)
 
-    @lower(hrint, _type___nv_bfloat16)
+    @lower(__float2bfloat16_rd, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hrint_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__float2bfloat16_rdf_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hrint_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL19__float2bfloat16_rdf_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float32)),
             ptrs,
         )
 
 
-_hrint_1_lower(shim_stream, shim_obj)
+_lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj)
 
 
-def h2trunc():
+def __float2bfloat16_ru():
     pass
 
 
-def _h2trunc_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2trunc_1(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = h2trunc(*h);
+    _ZL19__float2bfloat16_ruf_nbst(__nv_bfloat16 &retval , float* a) {
+        retval = __float2bfloat16_ru(*a);
         return 0;
     }
         """
 
-    h2trunc_1 = declare_device(
-        "h2trunc_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__float2bfloat16_ruf_nbst = declare_device(
+        "_ZL19__float2bfloat16_ruf_nbst", _type___nv_bfloat16(CPointer(float32))
     )
 
-    def h2trunc_1_caller(arg_0):
-        return h2trunc_1(arg_0)
+    def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0):
+        return _ZL19__float2bfloat16_ruf_nbst(arg_0)
 
-    @lower(h2trunc, _type___nv_bfloat162)
+    @lower(__float2bfloat16_ru, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2trunc_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__float2bfloat16_ruf_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2trunc_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__float2bfloat16_ruf_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(float32)),
             ptrs,
         )
 
 
-_h2trunc_1_lower(shim_stream, shim_obj)
+_lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj)
 
 
-def h2ceil():
+def __bfloat162float():
     pass
 
 
-def _h2ceil_1_lower(shim_stream, shim_obj):
+def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2ceil_1(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = h2ceil(*h);
+    _ZL16__bfloat162float13__nv_bfloat16_nbst(float &retval , __nv_bfloat16* a) {
+        retval = __bfloat162float(*a);
         return 0;
     }
         """
 
-    h2ceil_1 = declare_device(
-        "h2ceil_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL16__bfloat162float13__nv_bfloat16_nbst = declare_device(
+        "_ZL16__bfloat162float13__nv_bfloat16_nbst",
+        float32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2ceil_1_caller(arg_0):
-        return h2ceil_1(arg_0)
+    def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2ceil, _type___nv_bfloat162)
+    @lower(__bfloat162float, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2ceil_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL16__bfloat162float13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2ceil_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL16__bfloat162float13__nv_bfloat16_nbst_caller,
+            signature(float32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2ceil_1_lower(shim_stream, shim_obj)
+_lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2floor():
+def __float2bfloat162_rn():
     pass
 
 
-def _h2floor_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2floor_1(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = h2floor(*h);
+    _ZL20__float2bfloat162_rnf_nbst(__nv_bfloat162 &retval , float* a) {
+        retval = __float2bfloat162_rn(*a);
         return 0;
     }
         """
 
-    h2floor_1 = declare_device(
-        "h2floor_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL20__float2bfloat162_rnf_nbst = declare_device(
+        "_ZL20__float2bfloat162_rnf_nbst",
+        _type___nv_bfloat162(CPointer(float32)),
     )
 
-    def h2floor_1_caller(arg_0):
-        return h2floor_1(arg_0)
+    def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0):
+        return _ZL20__float2bfloat162_rnf_nbst(arg_0)
 
-    @lower(h2floor, _type___nv_bfloat162)
+    @lower(__float2bfloat162_rn, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2floor_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__float2bfloat162_rnf_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2floor_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL20__float2bfloat162_rnf_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(float32)),
             ptrs,
         )
 
 
-_h2floor_1_lower(shim_stream, shim_obj)
+_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj)
 
 
-def h2rint():
+def __floats2bfloat162_rn():
     pass
 
 
-def _h2rint_1_lower(shim_stream, shim_obj):
+def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2rint_1(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = h2rint(*h);
+    _ZL21__floats2bfloat162_rnff_nbst(__nv_bfloat162 &retval , float* a, float* b) {
+        retval = __floats2bfloat162_rn(*a, *b);
         return 0;
     }
         """
 
-    h2rint_1 = declare_device(
-        "h2rint_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL21__floats2bfloat162_rnff_nbst = declare_device(
+        "_ZL21__floats2bfloat162_rnff_nbst",
+        _type___nv_bfloat162(CPointer(float32), CPointer(float32)),
     )
 
-    def h2rint_1_caller(arg_0):
-        return h2rint_1(arg_0)
+    def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1):
+        return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1)
 
-    @lower(h2rint, _type___nv_bfloat162)
+    @lower(__floats2bfloat162_rn, float32, float32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2rint_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL21__floats2bfloat162_rnff_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2rint_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL21__floats2bfloat162_rnff_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(float32), CPointer(float32)
+            ),
             ptrs,
         )
 
 
-_h2rint_1_lower(shim_stream, shim_obj)
+_lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj)
 
 
-def hsqrt():
+def __low2float():
     pass
 
 
-def _hsqrt_1_lower(shim_stream, shim_obj):
+def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hsqrt(*a);
+    _ZL11__low2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) {
+        retval = __low2float(*a);
         return 0;
     }
         """
 
-    hsqrt_1 = declare_device(
-        "hsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL11__low2float14__nv_bfloat162_nbst = declare_device(
+        "_ZL11__low2float14__nv_bfloat162_nbst",
+        float32(CPointer(_type___nv_bfloat162)),
     )
 
-    def hsqrt_1_caller(arg_0):
-        return hsqrt_1(arg_0)
+    def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL11__low2float14__nv_bfloat162_nbst(arg_0)
 
-    @lower(hsqrt, _type___nv_bfloat16)
+    @lower(__low2float, _type___nv_bfloat162)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hsqrt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL11__low2float14__nv_bfloat162_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hsqrt_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL11__low2float14__nv_bfloat162_nbst_caller,
+            signature(float32, CPointer(_type___nv_bfloat162)),
             ptrs,
         )
 
 
-_hsqrt_1_lower(shim_stream, shim_obj)
+_lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj)
 
 
-def hrsqrt():
+def __high2float():
     pass
 
 
-def _hrsqrt_1_lower(shim_stream, shim_obj):
+def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hrsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hrsqrt(*a);
+    _ZL12__high2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) {
+        retval = __high2float(*a);
         return 0;
     }
         """
 
-    hrsqrt_1 = declare_device(
-        "hrsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL12__high2float14__nv_bfloat162_nbst = declare_device(
+        "_ZL12__high2float14__nv_bfloat162_nbst",
+        float32(CPointer(_type___nv_bfloat162)),
     )
 
-    def hrsqrt_1_caller(arg_0):
-        return hrsqrt_1(arg_0)
+    def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL12__high2float14__nv_bfloat162_nbst(arg_0)
 
-    @lower(hrsqrt, _type___nv_bfloat16)
+    @lower(__high2float, _type___nv_bfloat162)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hrsqrt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL12__high2float14__nv_bfloat162_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hrsqrt_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL12__high2float14__nv_bfloat162_nbst_caller,
+            signature(float32, CPointer(_type___nv_bfloat162)),
             ptrs,
         )
 
 
-_hrsqrt_1_lower(shim_stream, shim_obj)
+_lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj)
 
 
-def hrcp():
+def __float22bfloat162_rn():
     pass
 
 
-def _hrcp_1_lower(shim_stream, shim_obj):
+def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hrcp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hrcp(*a);
+    _ZL21__float22bfloat162_rn6float2_nbst(__nv_bfloat162 &retval , float2* a) {
+        retval = __float22bfloat162_rn(*a);
         return 0;
     }
         """
 
-    hrcp_1 = declare_device(
-        "hrcp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL21__float22bfloat162_rn6float2_nbst = declare_device(
+        "_ZL21__float22bfloat162_rn6float2_nbst",
+        _type___nv_bfloat162(CPointer(float32x2)),
     )
 
-    def hrcp_1_caller(arg_0):
-        return hrcp_1(arg_0)
+    def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0):
+        return _ZL21__float22bfloat162_rn6float2_nbst(arg_0)
 
-    @lower(hrcp, _type___nv_bfloat16)
+    @lower(__float22bfloat162_rn, float32x2)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hrcp_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL21__float22bfloat162_rn6float2_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hrcp_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL21__float22bfloat162_rn6float2_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(float32x2)),
             ptrs,
         )
 
 
-_hrcp_1_lower(shim_stream, shim_obj)
+_lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj)
 
 
-def hlog():
+def __bfloat1622float2():
     pass
 
 
-def _hlog_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hlog_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hlog(*a);
+    _ZL18__bfloat1622float214__nv_bfloat162_nbst(float2 &retval , __nv_bfloat162* a) {
+        retval = __bfloat1622float2(*a);
         return 0;
     }
         """
 
-    hlog_1 = declare_device(
-        "hlog_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL18__bfloat1622float214__nv_bfloat162_nbst = declare_device(
+        "_ZL18__bfloat1622float214__nv_bfloat162_nbst",
+        float32x2(CPointer(_type___nv_bfloat162)),
     )
 
-    def hlog_1_caller(arg_0):
-        return hlog_1(arg_0)
+    def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0)
 
-    @lower(hlog, _type___nv_bfloat16)
+    @lower(__bfloat1622float2, _type___nv_bfloat162)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hlog_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat1622float214__nv_bfloat162_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hlog_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller,
+            signature(float32x2, CPointer(_type___nv_bfloat162)),
             ptrs,
         )
 
 
-_hlog_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj)
 
 
-def hlog2():
+def __bfloat162char_rz():
     pass
 
 
-def _hlog2_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hlog2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hlog2(*a);
+    _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(signed char &retval , __nv_bfloat16* h) {
+        retval = __bfloat162char_rz(*h);
         return 0;
     }
         """
 
-    hlog2_1 = declare_device(
-        "hlog2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL18__bfloat162char_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst",
+        int8(CPointer(_type___nv_bfloat16)),
     )
 
-    def hlog2_1_caller(arg_0):
-        return hlog2_1(arg_0)
+    def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(hlog2, _type___nv_bfloat16)
+    @lower(__bfloat162char_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hlog2_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hlog2_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller,
+            signature(int8, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_hlog2_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def hlog10():
+def __bfloat162uchar_rz():
     pass
 
 
-def _hlog10_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hlog10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hlog10(*a);
+    _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(unsigned char &retval , __nv_bfloat16* h) {
+        retval = __bfloat162uchar_rz(*h);
         return 0;
     }
         """
 
-    hlog10_1 = declare_device(
-        "hlog10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst",
+        uint8(CPointer(_type___nv_bfloat16)),
     )
 
-    def hlog10_1_caller(arg_0):
-        return hlog10_1(arg_0)
+    def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(hlog10, _type___nv_bfloat16)
+    @lower(__bfloat162uchar_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hlog10_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hlog10_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller,
+            signature(uint8, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_hlog10_1_lower(shim_stream, shim_obj)
+_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def hexp():
+def __bfloat162int_rn():
     pass
 
 
-def _hexp_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hexp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hexp(*a);
+    _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162int_rn(*h);
         return 0;
     }
         """
 
-    hexp_1 = declare_device(
-        "hexp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__bfloat162int_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst",
+        int32(CPointer(_type___nv_bfloat16)),
     )
 
-    def hexp_1_caller(arg_0):
-        return hexp_1(arg_0)
+    def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(hexp, _type___nv_bfloat16)
+    @lower(__bfloat162int_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hexp_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hexp_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller,
+            signature(int32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_hexp_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def htanh_approx():
+def __bfloat162int_rz():
     pass
 
 
-def _htanh_approx_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    htanh_approx_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = htanh_approx(*a);
+    _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162int_rz(*h);
         return 0;
     }
         """
 
-    htanh_approx_1 = declare_device(
-        "htanh_approx_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__bfloat162int_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst",
+        int32(CPointer(_type___nv_bfloat16)),
     )
 
-    def htanh_approx_1_caller(arg_0):
-        return htanh_approx_1(arg_0)
+    def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(htanh_approx, _type___nv_bfloat16)
+    @lower(__bfloat162int_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("htanh_approx_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            htanh_approx_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller,
+            signature(int32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_htanh_approx_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2tanh_approx():
+def __bfloat162int_rd():
     pass
 
 
-def _h2tanh_approx_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2tanh_approx_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2tanh_approx(*a);
+    _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162int_rd(*h);
         return 0;
     }
         """
 
-    h2tanh_approx_1 = declare_device(
-        "h2tanh_approx_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL17__bfloat162int_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst",
+        int32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2tanh_approx_1_caller(arg_0):
-        return h2tanh_approx_1(arg_0)
+    def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2tanh_approx, _type___nv_bfloat162)
+    @lower(__bfloat162int_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2tanh_approx_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2tanh_approx_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller,
+            signature(int32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2tanh_approx_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def htanh():
+def __bfloat162int_ru():
     pass
 
 
-def _htanh_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    htanh_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = htanh(*a);
+    _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162int_ru(*h);
         return 0;
     }
         """
 
-    htanh_1 = declare_device(
-        "htanh_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__bfloat162int_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst",
+        int32(CPointer(_type___nv_bfloat16)),
     )
 
-    def htanh_1_caller(arg_0):
-        return htanh_1(arg_0)
+    def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(htanh, _type___nv_bfloat16)
+    @lower(__bfloat162int_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("htanh_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            htanh_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller,
+            signature(int32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_htanh_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2tanh():
+def __int2bfloat16_rn():
     pass
 
 
-def _h2tanh_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2tanh_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2tanh(*a);
+    _ZL17__int2bfloat16_rni_nbst(__nv_bfloat16 &retval , int* i) {
+        retval = __int2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    h2tanh_1 = declare_device(
-        "h2tanh_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL17__int2bfloat16_rni_nbst = declare_device(
+        "_ZL17__int2bfloat16_rni_nbst", _type___nv_bfloat16(CPointer(int32))
     )
 
-    def h2tanh_1_caller(arg_0):
-        return h2tanh_1(arg_0)
+    def _ZL17__int2bfloat16_rni_nbst_caller(arg_0):
+        return _ZL17__int2bfloat16_rni_nbst(arg_0)
 
-    @lower(h2tanh, _type___nv_bfloat162)
+    @lower(__int2bfloat16_rn, int32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2tanh_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2tanh_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL17__int2bfloat16_rni_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int32)),
             ptrs,
         )
 
 
-_h2tanh_1_lower(shim_stream, shim_obj)
+_lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj)
 
 
-def hexp2():
+def __int2bfloat16_rz():
     pass
 
 
-def _hexp2_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hexp2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hexp2(*a);
+    _ZL17__int2bfloat16_rzi_nbst(__nv_bfloat16 &retval , int* i) {
+        retval = __int2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    hexp2_1 = declare_device(
-        "hexp2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__int2bfloat16_rzi_nbst = declare_device(
+        "_ZL17__int2bfloat16_rzi_nbst", _type___nv_bfloat16(CPointer(int32))
     )
 
-    def hexp2_1_caller(arg_0):
-        return hexp2_1(arg_0)
+    def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0):
+        return _ZL17__int2bfloat16_rzi_nbst(arg_0)
 
-    @lower(hexp2, _type___nv_bfloat16)
+    @lower(__int2bfloat16_rz, int32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hexp2_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hexp2_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__int2bfloat16_rzi_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int32)),
             ptrs,
         )
 
 
-_hexp2_1_lower(shim_stream, shim_obj)
+_lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj)
 
 
-def hexp10():
+def __int2bfloat16_rd():
     pass
 
 
-def _hexp10_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hexp10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hexp10(*a);
+    _ZL17__int2bfloat16_rdi_nbst(__nv_bfloat16 &retval , int* i) {
+        retval = __int2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    hexp10_1 = declare_device(
-        "hexp10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__int2bfloat16_rdi_nbst = declare_device(
+        "_ZL17__int2bfloat16_rdi_nbst", _type___nv_bfloat16(CPointer(int32))
     )
 
-    def hexp10_1_caller(arg_0):
-        return hexp10_1(arg_0)
+    def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0):
+        return _ZL17__int2bfloat16_rdi_nbst(arg_0)
 
-    @lower(hexp10, _type___nv_bfloat16)
+    @lower(__int2bfloat16_rd, int32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hexp10_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hexp10_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__int2bfloat16_rdi_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int32)),
             ptrs,
         )
 
 
-_hexp10_1_lower(shim_stream, shim_obj)
+_lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj)
 
 
-def hcos():
+def __int2bfloat16_ru():
     pass
 
 
-def _hcos_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hcos_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hcos(*a);
+    _ZL17__int2bfloat16_rui_nbst(__nv_bfloat16 &retval , int* i) {
+        retval = __int2bfloat16_ru(*i);
         return 0;
     }
         """
 
-    hcos_1 = declare_device(
-        "hcos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL17__int2bfloat16_rui_nbst = declare_device(
+        "_ZL17__int2bfloat16_rui_nbst", _type___nv_bfloat16(CPointer(int32))
     )
 
-    def hcos_1_caller(arg_0):
-        return hcos_1(arg_0)
+    def _ZL17__int2bfloat16_rui_nbst_caller(arg_0):
+        return _ZL17__int2bfloat16_rui_nbst(arg_0)
 
-    @lower(hcos, _type___nv_bfloat16)
+    @lower(__int2bfloat16_ru, int32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hcos_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hcos_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL17__int2bfloat16_rui_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int32)),
             ptrs,
         )
 
 
-_hcos_1_lower(shim_stream, shim_obj)
+_lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj)
 
 
-def hsin():
+def __bfloat162short_rn():
     pass
 
 
-def _hsin_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    hsin_1(__nv_bfloat16 &retval , __nv_bfloat16* a) {
-        retval = hsin(*a);
+    _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162short_rn(*h);
         return 0;
     }
         """
 
-    hsin_1 = declare_device(
-        "hsin_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL19__bfloat162short_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst",
+        int16(CPointer(_type___nv_bfloat16)),
     )
 
-    def hsin_1_caller(arg_0):
-        return hsin_1(arg_0)
+    def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(hsin, _type___nv_bfloat16)
+    @lower(__bfloat162short_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("hsin_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            hsin_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller,
+            signature(int16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_hsin_1_lower(shim_stream, shim_obj)
+_lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2sqrt():
+def __bfloat162short_rz():
     pass
 
 
-def _h2sqrt_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2sqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2sqrt(*a);
+    _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162short_rz(*h);
         return 0;
     }
         """
 
-    h2sqrt_1 = declare_device(
-        "h2sqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__bfloat162short_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst",
+        int16(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2sqrt_1_caller(arg_0):
-        return h2sqrt_1(arg_0)
+    def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2sqrt, _type___nv_bfloat162)
+    @lower(__bfloat162short_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2sqrt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2sqrt_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller,
+            signature(int16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2sqrt_1_lower(shim_stream, shim_obj)
+_lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2rsqrt():
+def __bfloat162short_rd():
     pass
 
 
-def _h2rsqrt_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2rsqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2rsqrt(*a);
+    _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162short_rd(*h);
         return 0;
     }
         """
 
-    h2rsqrt_1 = declare_device(
-        "h2rsqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__bfloat162short_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst",
+        int16(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2rsqrt_1_caller(arg_0):
-        return h2rsqrt_1(arg_0)
+    def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2rsqrt, _type___nv_bfloat162)
+    @lower(__bfloat162short_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2rsqrt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2rsqrt_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller,
+            signature(int16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2rsqrt_1_lower(shim_stream, shim_obj)
+_lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2rcp():
+def __bfloat162short_ru():
     pass
 
 
-def _h2rcp_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2rcp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2rcp(*a);
+    _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162short_ru(*h);
         return 0;
     }
         """
 
-    h2rcp_1 = declare_device(
-        "h2rcp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__bfloat162short_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst",
+        int16(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2rcp_1_caller(arg_0):
-        return h2rcp_1(arg_0)
+    def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2rcp, _type___nv_bfloat162)
+    @lower(__bfloat162short_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2rcp_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2rcp_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller,
+            signature(int16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2rcp_1_lower(shim_stream, shim_obj)
+_lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2log():
+def __short2bfloat16_rn():
     pass
 
 
-def _h2log_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2log_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2log(*a);
+    _ZL19__short2bfloat16_rns_nbst(__nv_bfloat16 &retval , short* i) {
+        retval = __short2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    h2log_1 = declare_device(
-        "h2log_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__short2bfloat16_rns_nbst = declare_device(
+        "_ZL19__short2bfloat16_rns_nbst", _type___nv_bfloat16(CPointer(int16))
     )
 
-    def h2log_1_caller(arg_0):
-        return h2log_1(arg_0)
+    def _ZL19__short2bfloat16_rns_nbst_caller(arg_0):
+        return _ZL19__short2bfloat16_rns_nbst(arg_0)
 
-    @lower(h2log, _type___nv_bfloat162)
+    @lower(__short2bfloat16_rn, int16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2log_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__short2bfloat16_rns_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2log_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__short2bfloat16_rns_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int16)),
             ptrs,
         )
 
 
-_h2log_1_lower(shim_stream, shim_obj)
+_lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj)
 
 
-def h2log2():
+def __short2bfloat16_rz():
     pass
 
 
-def _h2log2_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2log2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2log2(*a);
+    _ZL19__short2bfloat16_rzs_nbst(__nv_bfloat16 &retval , short* i) {
+        retval = __short2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    h2log2_1 = declare_device(
-        "h2log2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__short2bfloat16_rzs_nbst = declare_device(
+        "_ZL19__short2bfloat16_rzs_nbst", _type___nv_bfloat16(CPointer(int16))
     )
 
-    def h2log2_1_caller(arg_0):
-        return h2log2_1(arg_0)
+    def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0):
+        return _ZL19__short2bfloat16_rzs_nbst(arg_0)
 
-    @lower(h2log2, _type___nv_bfloat162)
+    @lower(__short2bfloat16_rz, int16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2log2_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__short2bfloat16_rzs_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2log2_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__short2bfloat16_rzs_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int16)),
             ptrs,
         )
 
 
-_h2log2_1_lower(shim_stream, shim_obj)
+_lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj)
 
 
-def h2log10():
+def __short2bfloat16_rd():
     pass
 
 
-def _h2log10_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2log10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2log10(*a);
+    _ZL19__short2bfloat16_rds_nbst(__nv_bfloat16 &retval , short* i) {
+        retval = __short2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    h2log10_1 = declare_device(
-        "h2log10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__short2bfloat16_rds_nbst = declare_device(
+        "_ZL19__short2bfloat16_rds_nbst", _type___nv_bfloat16(CPointer(int16))
     )
 
-    def h2log10_1_caller(arg_0):
-        return h2log10_1(arg_0)
+    def _ZL19__short2bfloat16_rds_nbst_caller(arg_0):
+        return _ZL19__short2bfloat16_rds_nbst(arg_0)
 
-    @lower(h2log10, _type___nv_bfloat162)
+    @lower(__short2bfloat16_rd, int16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2log10_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__short2bfloat16_rds_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2log10_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__short2bfloat16_rds_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int16)),
             ptrs,
         )
 
 
-_h2log10_1_lower(shim_stream, shim_obj)
+_lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj)
 
 
-def h2exp():
+def __short2bfloat16_ru():
     pass
 
 
-def _h2exp_1_lower(shim_stream, shim_obj):
+def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2exp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2exp(*a);
+    _ZL19__short2bfloat16_rus_nbst(__nv_bfloat16 &retval , short* i) {
+        retval = __short2bfloat16_ru(*i);
         return 0;
     }
         """
 
-    h2exp_1 = declare_device(
-        "h2exp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL19__short2bfloat16_rus_nbst = declare_device(
+        "_ZL19__short2bfloat16_rus_nbst", _type___nv_bfloat16(CPointer(int16))
     )
 
-    def h2exp_1_caller(arg_0):
-        return h2exp_1(arg_0)
+    def _ZL19__short2bfloat16_rus_nbst_caller(arg_0):
+        return _ZL19__short2bfloat16_rus_nbst(arg_0)
 
-    @lower(h2exp, _type___nv_bfloat162)
+    @lower(__short2bfloat16_ru, int16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2exp_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL19__short2bfloat16_rus_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2exp_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL19__short2bfloat16_rus_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int16)),
             ptrs,
         )
 
 
-_h2exp_1_lower(shim_stream, shim_obj)
+_lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj)
 
 
-def h2exp2():
+def __bfloat162uint_rn():
     pass
 
 
-def _h2exp2_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2exp2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2exp2(*a);
+    _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162uint_rn(*h);
         return 0;
     }
         """
 
-    h2exp2_1 = declare_device(
-        "h2exp2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst",
+        uint32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2exp2_1_caller(arg_0):
-        return h2exp2_1(arg_0)
+    def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2exp2, _type___nv_bfloat162)
+    @lower(__bfloat162uint_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2exp2_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2exp2_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller,
+            signature(uint32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2exp2_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2exp10():
+def __bfloat162uint_rz():
     pass
 
 
-def _h2exp10_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2exp10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2exp10(*a);
+    _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162uint_rz(*h);
         return 0;
     }
         """
 
-    h2exp10_1 = declare_device(
-        "h2exp10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst",
+        uint32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2exp10_1_caller(arg_0):
-        return h2exp10_1(arg_0)
+    def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2exp10, _type___nv_bfloat162)
+    @lower(__bfloat162uint_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2exp10_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2exp10_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller,
+            signature(uint32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2exp10_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2cos():
+def __bfloat162uint_rd():
     pass
 
 
-def _h2cos_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2cos_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2cos(*a);
+    _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162uint_rd(*h);
         return 0;
     }
         """
 
-    h2cos_1 = declare_device(
-        "h2cos_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst",
+        uint32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2cos_1_caller(arg_0):
-        return h2cos_1(arg_0)
+    def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2cos, _type___nv_bfloat162)
+    @lower(__bfloat162uint_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2cos_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2cos_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller,
+            signature(uint32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2cos_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def h2sin():
+def __bfloat162uint_ru():
     pass
 
 
-def _h2sin_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    h2sin_1(__nv_bfloat162 &retval , __nv_bfloat162* a) {
-        retval = h2sin(*a);
+    _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) {
+        retval = __bfloat162uint_ru(*h);
         return 0;
     }
         """
 
-    h2sin_1 = declare_device(
-        "h2sin_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst",
+        uint32(CPointer(_type___nv_bfloat16)),
     )
 
-    def h2sin_1_caller(arg_0):
-        return h2sin_1(arg_0)
+    def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(h2sin, _type___nv_bfloat162)
+    @lower(__bfloat162uint_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("h2sin_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            h2sin_1_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller,
+            signature(uint32, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_h2sin_1_lower(shim_stream, shim_obj)
+_lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def atomicAdd():
+def __uint2bfloat16_rn():
     pass
 
 
-def _atomicAdd_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    atomicAdd_1(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) {
-        retval = atomicAdd(*address, *val);
+    _ZL18__uint2bfloat16_rnj_nbst(__nv_bfloat16 &retval , unsigned int* i) {
+        retval = __uint2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    atomicAdd_1 = declare_device(
-        "atomicAdd_1",
-        _type___nv_bfloat162(
-            CPointer(CPointer(_type___nv_bfloat162)),
-            CPointer(_type___nv_bfloat162),
-        ),
+    _ZL18__uint2bfloat16_rnj_nbst = declare_device(
+        "_ZL18__uint2bfloat16_rnj_nbst", _type___nv_bfloat16(CPointer(uint32))
     )
 
-    def atomicAdd_1_caller(arg_0, arg_1):
-        return atomicAdd_1(arg_0, arg_1)
+    def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0):
+        return _ZL18__uint2bfloat16_rnj_nbst(arg_0)
 
-    @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)
+    @lower(__uint2bfloat16_rn, uint32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("atomicAdd_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__uint2bfloat16_rnj_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            atomicAdd_1_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(CPointer(_type___nv_bfloat162)),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL18__uint2bfloat16_rnj_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint32)),
             ptrs,
         )
 
 
-_atomicAdd_1_lower(shim_stream, shim_obj)
+_lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj)
+
+
+def __uint2bfloat16_rz():
+    pass
 
 
-def _atomicAdd_2_lower(shim_stream, shim_obj):
+def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    atomicAdd_2(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) {
-        retval = atomicAdd(*address, *val);
+    _ZL18__uint2bfloat16_rzj_nbst(__nv_bfloat16 &retval , unsigned int* i) {
+        retval = __uint2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    atomicAdd_2 = declare_device(
-        "atomicAdd_2",
-        _type___nv_bfloat16(
-            CPointer(CPointer(_type___nv_bfloat16)),
-            CPointer(_type___nv_bfloat16),
-        ),
+    _ZL18__uint2bfloat16_rzj_nbst = declare_device(
+        "_ZL18__uint2bfloat16_rzj_nbst", _type___nv_bfloat16(CPointer(uint32))
     )
 
-    def atomicAdd_2_caller(arg_0, arg_1):
-        return atomicAdd_2(arg_0, arg_1)
+    def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0):
+        return _ZL18__uint2bfloat16_rzj_nbst(arg_0)
 
-    @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)
+    @lower(__uint2bfloat16_rz, uint32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("atomicAdd_2", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__uint2bfloat16_rzj_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            atomicAdd_2_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(CPointer(_type___nv_bfloat16)),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL18__uint2bfloat16_rzj_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint32)),
             ptrs,
         )
 
 
-_atomicAdd_2_lower(shim_stream, shim_obj)
+_lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj)
+
+
+def __uint2bfloat16_rd():
+    pass
 
 
-def _operator_add_1_lower(shim_stream, shim_obj):
+def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_add_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator+(*lh, *rh);
+    _ZL18__uint2bfloat16_rdj_nbst(__nv_bfloat16 &retval , unsigned int* i) {
+        retval = __uint2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    operator_add_1 = declare_device(
-        "operator_add_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL18__uint2bfloat16_rdj_nbst = declare_device(
+        "_ZL18__uint2bfloat16_rdj_nbst", _type___nv_bfloat16(CPointer(uint32))
     )
 
-    def operator_add_1_caller(arg_0, arg_1):
-        return operator_add_1(arg_0, arg_1)
+    def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0):
+        return _ZL18__uint2bfloat16_rdj_nbst(arg_0)
 
-    @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__uint2bfloat16_rd, uint32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_add_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__uint2bfloat16_rdj_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_add_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL18__uint2bfloat16_rdj_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint32)),
             ptrs,
         )
 
 
-_operator_add_1_lower(shim_stream, shim_obj)
+_lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj)
 
 
-def _operator_sub_1_lower(shim_stream, shim_obj):
+def __uint2bfloat16_ru():
+    pass
+
+
+def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_sub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator-(*lh, *rh);
+    _ZL18__uint2bfloat16_ruj_nbst(__nv_bfloat16 &retval , unsigned int* i) {
+        retval = __uint2bfloat16_ru(*i);
         return 0;
     }
         """
 
-    operator_sub_1 = declare_device(
-        "operator_sub_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL18__uint2bfloat16_ruj_nbst = declare_device(
+        "_ZL18__uint2bfloat16_ruj_nbst", _type___nv_bfloat16(CPointer(uint32))
     )
 
-    def operator_sub_1_caller(arg_0, arg_1):
-        return operator_sub_1(arg_0, arg_1)
+    def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0):
+        return _ZL18__uint2bfloat16_ruj_nbst(arg_0)
 
-    @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__uint2bfloat16_ru, uint32)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_sub_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL18__uint2bfloat16_ruj_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_sub_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL18__uint2bfloat16_ruj_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint32)),
             ptrs,
         )
 
 
-_operator_sub_1_lower(shim_stream, shim_obj)
+_lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ushort_rn():
+    pass
 
 
-def _operator_mul_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_mul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator*(*lh, *rh);
+    _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ushort_rn(*h);
         return 0;
     }
         """
 
-    operator_mul_1 = declare_device(
-        "operator_mul_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst",
+        uint16(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_mul_1_caller(arg_0, arg_1):
-        return operator_mul_1(arg_0, arg_1)
+    def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ushort_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_mul_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_mul_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller,
+            signature(uint16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_mul_1_lower(shim_stream, shim_obj)
+_lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ushort_rz():
+    pass
 
 
-def _operator_truediv_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_truediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator/(*lh, *rh);
+    _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ushort_rz(*h);
         return 0;
     }
         """
 
-    operator_truediv_1 = declare_device(
-        "operator_truediv_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst",
+        uint16(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_truediv_1_caller(arg_0, arg_1):
-        return operator_truediv_1(arg_0, arg_1)
+    def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ushort_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_truediv_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_truediv_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller,
+            signature(uint16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_truediv_1_lower(shim_stream, shim_obj)
+_lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ushort_rd():
+    pass
 
 
-def _operator_iadd_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_iadd_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator+=(*lh, *rh);
+    _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ushort_rd(*h);
         return 0;
     }
         """
 
-    operator_iadd_1 = declare_device(
-        "operator_iadd_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst",
+        uint16(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_iadd_1_caller(arg_0, arg_1):
-        return operator_iadd_1(arg_0, arg_1)
+    def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ushort_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_iadd_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_iadd_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller,
+            signature(uint16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_iadd_1_lower(shim_stream, shim_obj)
+_lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def _operator_isub_1_lower(shim_stream, shim_obj):
+def __bfloat162ushort_ru():
+    pass
+
+
+def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_isub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator-=(*lh, *rh);
+    _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ushort_ru(*h);
         return 0;
     }
         """
 
-    operator_isub_1 = declare_device(
-        "operator_isub_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst",
+        uint16(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_isub_1_caller(arg_0, arg_1):
-        return operator_isub_1(arg_0, arg_1)
+    def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ushort_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_isub_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_isub_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller,
+            signature(uint16, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_isub_1_lower(shim_stream, shim_obj)
+_lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __ushort2bfloat16_rn():
+    pass
 
 
-def _operator_imul_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_imul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator*=(*lh, *rh);
+    _ZL20__ushort2bfloat16_rnt_nbst(__nv_bfloat16 &retval , unsigned short* i) {
+        retval = __ushort2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    operator_imul_1 = declare_device(
-        "operator_imul_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__ushort2bfloat16_rnt_nbst = declare_device(
+        "_ZL20__ushort2bfloat16_rnt_nbst", _type___nv_bfloat16(CPointer(uint16))
     )
 
-    def operator_imul_1_caller(arg_0, arg_1):
-        return operator_imul_1(arg_0, arg_1)
+    def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0):
+        return _ZL20__ushort2bfloat16_rnt_nbst(arg_0)
 
-    @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__ushort2bfloat16_rn, uint16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_imul_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__ushort2bfloat16_rnt_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_imul_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__ushort2bfloat16_rnt_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint16)),
             ptrs,
         )
 
 
-_operator_imul_1_lower(shim_stream, shim_obj)
+_lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj)
+
+
+def __ushort2bfloat16_rz():
+    pass
 
 
-def _operator_itruediv_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_itruediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator/=(*lh, *rh);
+    _ZL20__ushort2bfloat16_rzt_nbst(__nv_bfloat16 &retval , unsigned short* i) {
+        retval = __ushort2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    operator_itruediv_1 = declare_device(
-        "operator_itruediv_1",
-        _type___nv_bfloat16(
-            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
-        ),
+    _ZL20__ushort2bfloat16_rzt_nbst = declare_device(
+        "_ZL20__ushort2bfloat16_rzt_nbst", _type___nv_bfloat16(CPointer(uint16))
     )
 
-    def operator_itruediv_1_caller(arg_0, arg_1):
-        return operator_itruediv_1(arg_0, arg_1)
+    def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0):
+        return _ZL20__ushort2bfloat16_rzt_nbst(arg_0)
 
-    @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__ushort2bfloat16_rz, uint16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_itruediv_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__ushort2bfloat16_rzt_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_itruediv_1_caller,
-            signature(
-                _type___nv_bfloat16,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL20__ushort2bfloat16_rzt_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint16)),
             ptrs,
         )
 
 
-_operator_itruediv_1_lower(shim_stream, shim_obj)
+_lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj)
+
+
+def __ushort2bfloat16_rd():
+    pass
 
 
-def _operator_pos_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_pos_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = operator+(*h);
+    _ZL20__ushort2bfloat16_rdt_nbst(__nv_bfloat16 &retval , unsigned short* i) {
+        retval = __ushort2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    operator_pos_1 = declare_device(
-        "operator_pos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL20__ushort2bfloat16_rdt_nbst = declare_device(
+        "_ZL20__ushort2bfloat16_rdt_nbst", _type___nv_bfloat16(CPointer(uint16))
     )
 
-    def operator_pos_1_caller(arg_0):
-        return operator_pos_1(arg_0)
+    def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0):
+        return _ZL20__ushort2bfloat16_rdt_nbst(arg_0)
 
-    @lower(operator.pos, _type___nv_bfloat16)
+    @lower(__ushort2bfloat16_rd, uint16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_pos_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__ushort2bfloat16_rdt_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_pos_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL20__ushort2bfloat16_rdt_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint16)),
             ptrs,
         )
 
 
-_operator_pos_1_lower(shim_stream, shim_obj)
+_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj)
+
+
+def __ushort2bfloat16_ru():
+    pass
 
 
-def _operator_neg_1_lower(shim_stream, shim_obj):
+def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_neg_1(__nv_bfloat16 &retval , __nv_bfloat16* h) {
-        retval = operator-(*h);
+    _ZL20__ushort2bfloat16_rut_nbst(__nv_bfloat16 &retval , unsigned short* i) {
+        retval = __ushort2bfloat16_ru(*i);
         return 0;
     }
         """
 
-    operator_neg_1 = declare_device(
-        "operator_neg_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16))
+    _ZL20__ushort2bfloat16_rut_nbst = declare_device(
+        "_ZL20__ushort2bfloat16_rut_nbst", _type___nv_bfloat16(CPointer(uint16))
     )
 
-    def operator_neg_1_caller(arg_0):
-        return operator_neg_1(arg_0)
+    def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0):
+        return _ZL20__ushort2bfloat16_rut_nbst(arg_0)
 
-    @lower(operator.neg, _type___nv_bfloat16)
+    @lower(__ushort2bfloat16_ru, uint16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_neg_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL20__ushort2bfloat16_rut_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_neg_1_caller,
-            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            _ZL20__ushort2bfloat16_rut_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint16)),
             ptrs,
         )
 
 
-_operator_neg_1_lower(shim_stream, shim_obj)
+_lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ull_rn():
+    pass
 
 
-def _operator_eq_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_eq_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator==(*lh, *rh);
+    _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ull_rn(*h);
         return 0;
     }
         """
 
-    operator_eq_1 = declare_device(
-        "operator_eq_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst",
+        uint64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_eq_1_caller(arg_0, arg_1):
-        return operator_eq_1(arg_0, arg_1)
+    def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ull_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_eq_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_eq_1_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller,
+            signature(uint64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_eq_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ull_rz():
+    pass
 
 
-def _operator_ne_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_ne_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator!=(*lh, *rh);
+    _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ull_rz(*h);
         return 0;
     }
         """
 
-    operator_ne_1 = declare_device(
-        "operator_ne_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst",
+        uint64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_ne_1_caller(arg_0, arg_1):
-        return operator_ne_1(arg_0, arg_1)
+    def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ull_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_ne_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_ne_1_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller,
+            signature(uint64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_ne_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
 
+def make_bfloat162():
+    pass
 
-def _operator_gt_1_lower(shim_stream, shim_obj):
+
+def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_gt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator>(*lh, *rh);
+    _ZL14make_bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) {
+        retval = make_bfloat162(*x, *y);
         return 0;
     }
         """
 
-    operator_gt_1 = declare_device(
-        "operator_gt_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL14make_bfloat16213__nv_bfloat16S__nbst = declare_device(
+        "_ZL14make_bfloat16213__nv_bfloat16S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
     )
 
-    def operator_gt_1_caller(arg_0, arg_1):
-        return operator_gt_1(arg_0, arg_1)
+    def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1):
+        return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1)
 
-    @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_gt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL14make_bfloat16213__nv_bfloat16S__nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_gt_1_caller,
+            _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller,
             signature(
-                bool_,
+                _type___nv_bfloat162,
                 CPointer(_type___nv_bfloat16),
                 CPointer(_type___nv_bfloat16),
             ),
@@ -3883,858 +4083,11629 @@ def impl(context, builder, sig, args):
         )
 
 
-_operator_gt_1_lower(shim_stream, shim_obj)
+_lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj)
 
 
-def _operator_lt_1_lower(shim_stream, shim_obj):
-    shim_raw_str = """
+def __bfloat162ull_rd():
+    pass
+
+
+def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
     extern "C" __device__ int
-    operator_lt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator<(*lh, *rh);
+    _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ull_rd(*h);
         return 0;
     }
         """
 
-    operator_lt_1 = declare_device(
-        "operator_lt_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst",
+        uint64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_lt_1_caller(arg_0, arg_1):
-        return operator_lt_1(arg_0, arg_1)
+    def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ull_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_lt_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_lt_1_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller,
+            signature(uint64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_lt_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ull_ru():
+    pass
 
 
-def _operator_ge_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_ge_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator>=(*lh, *rh);
+    _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ull_ru(*h);
         return 0;
     }
         """
 
-    operator_ge_1 = declare_device(
-        "operator_ge_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst",
+        uint64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_ge_1_caller(arg_0, arg_1):
-        return operator_ge_1(arg_0, arg_1)
+    def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__bfloat162ull_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_ge_1", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_ge_1_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller,
+            signature(uint64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_ge_1_lower(shim_stream, shim_obj)
+_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __ull2bfloat16_rn():
+    pass
 
 
-def _operator_le_1_lower(shim_stream, shim_obj):
+def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_le_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
-        retval = operator<=(*lh, *rh);
+    _ZL17__ull2bfloat16_rny_nbst(__nv_bfloat16 &retval , unsigned long long* i) {
+        retval = __ull2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    operator_le_1 = declare_device(
-        "operator_le_1",
-        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    _ZL17__ull2bfloat16_rny_nbst = declare_device(
+        "_ZL17__ull2bfloat16_rny_nbst", _type___nv_bfloat16(CPointer(uint64))
     )
 
-    def operator_le_1_caller(arg_0, arg_1):
-        return operator_le_1(arg_0, arg_1)
+    def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0):
+        return _ZL17__ull2bfloat16_rny_nbst(arg_0)
 
-    @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16)
+    @lower(__ull2bfloat16_rn, uint64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_le_1", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_le_1_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat16),
-                CPointer(_type___nv_bfloat16),
-            ),
+            _ZL17__ull2bfloat16_rny_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint64)),
             ptrs,
         )
 
 
-_operator_le_1_lower(shim_stream, shim_obj)
+_lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj)
+
+
+def __ull2bfloat16_rz():
+    pass
 
 
-def _operator_add_2_lower(shim_stream, shim_obj):
+def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_add_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator+(*lh, *rh);
+    _ZL17__ull2bfloat16_rzy_nbst(__nv_bfloat16 &retval , unsigned long long* i) {
+        retval = __ull2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    operator_add_2 = declare_device(
-        "operator_add_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL17__ull2bfloat16_rzy_nbst = declare_device(
+        "_ZL17__ull2bfloat16_rzy_nbst", _type___nv_bfloat16(CPointer(uint64))
     )
 
-    def operator_add_2_caller(arg_0, arg_1):
-        return operator_add_2(arg_0, arg_1)
+    def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0):
+        return _ZL17__ull2bfloat16_rzy_nbst(arg_0)
 
-    @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__ull2bfloat16_rz, uint64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_add_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_add_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL17__ull2bfloat16_rzy_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint64)),
             ptrs,
         )
 
 
-_operator_add_2_lower(shim_stream, shim_obj)
+_lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj)
+
+
+def __ull2bfloat16_rd():
+    pass
 
 
-def _operator_sub_2_lower(shim_stream, shim_obj):
+def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_sub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator-(*lh, *rh);
+    _ZL17__ull2bfloat16_rdy_nbst(__nv_bfloat16 &retval , unsigned long long* i) {
+        retval = __ull2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    operator_sub_2 = declare_device(
-        "operator_sub_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL17__ull2bfloat16_rdy_nbst = declare_device(
+        "_ZL17__ull2bfloat16_rdy_nbst", _type___nv_bfloat16(CPointer(uint64))
     )
 
-    def operator_sub_2_caller(arg_0, arg_1):
-        return operator_sub_2(arg_0, arg_1)
+    def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0):
+        return _ZL17__ull2bfloat16_rdy_nbst(arg_0)
 
-    @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__ull2bfloat16_rd, uint64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_sub_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_sub_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL17__ull2bfloat16_rdy_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint64)),
             ptrs,
         )
 
 
-_operator_sub_2_lower(shim_stream, shim_obj)
+_lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj)
 
 
-def _operator_mul_2_lower(shim_stream, shim_obj):
+def __ull2bfloat16_ru():
+    pass
+
+
+def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_mul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator*(*lh, *rh);
+    _ZL17__ull2bfloat16_ruy_nbst(__nv_bfloat16 &retval , unsigned long long* i) {
+        retval = __ull2bfloat16_ru(*i);
         return 0;
     }
         """
 
-    operator_mul_2 = declare_device(
-        "operator_mul_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL17__ull2bfloat16_ruy_nbst = declare_device(
+        "_ZL17__ull2bfloat16_ruy_nbst", _type___nv_bfloat16(CPointer(uint64))
     )
 
-    def operator_mul_2_caller(arg_0, arg_1):
-        return operator_mul_2(arg_0, arg_1)
+    def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0):
+        return _ZL17__ull2bfloat16_ruy_nbst(arg_0)
 
-    @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__ull2bfloat16_ru, uint64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_mul_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_mul_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL17__ull2bfloat16_ruy_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(uint64)),
             ptrs,
         )
 
 
-_operator_mul_2_lower(shim_stream, shim_obj)
+_lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ll_rn():
+    pass
 
 
-def _operator_truediv_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_truediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator/(*lh, *rh);
+    _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ll_rn(*h);
         return 0;
     }
         """
 
-    operator_truediv_2 = declare_device(
-        "operator_truediv_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst = declare_device(
+        "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst",
+        int64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_truediv_2_caller(arg_0, arg_1):
-        return operator_truediv_2(arg_0, arg_1)
+    def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__bfloat162ll_rn, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_truediv_2", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_truediv_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller,
+            signature(int64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_truediv_2_lower(shim_stream, shim_obj)
+_lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj)
 
 
-def _operator_iadd_2_lower(shim_stream, shim_obj):
+def __bfloat162ll_rz():
+    pass
+
+
+def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_iadd_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator+=(*lh, *rh);
+    _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ll_rz(*h);
         return 0;
     }
         """
 
-    operator_iadd_2 = declare_device(
-        "operator_iadd_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst = declare_device(
+        "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst",
+        int64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_iadd_2_caller(arg_0, arg_1):
-        return operator_iadd_2(arg_0, arg_1)
+    def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__bfloat162ll_rz, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_iadd_2", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_iadd_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller,
+            signature(int64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_iadd_2_lower(shim_stream, shim_obj)
+_lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ll_rd():
+    pass
 
 
-def _operator_isub_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_isub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator-=(*lh, *rh);
+    _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ll_rd(*h);
         return 0;
     }
         """
 
-    operator_isub_2 = declare_device(
-        "operator_isub_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst = declare_device(
+        "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst",
+        int64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_isub_2_caller(arg_0, arg_1):
-        return operator_isub_2(arg_0, arg_1)
+    def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__bfloat162ll_rd, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_isub_2", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_isub_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller,
+            signature(int64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_isub_2_lower(shim_stream, shim_obj)
+_lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162ll_ru():
+    pass
 
 
-def _operator_imul_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_imul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator*=(*lh, *rh);
+    _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) {
+        retval = __bfloat162ll_ru(*h);
         return 0;
     }
         """
 
-    operator_imul_2 = declare_device(
-        "operator_imul_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst = declare_device(
+        "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst",
+        int64(CPointer(_type___nv_bfloat16)),
     )
 
-    def operator_imul_2_caller(arg_0, arg_1):
-        return operator_imul_2(arg_0, arg_1)
+    def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0)
 
-    @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__bfloat162ll_ru, _type___nv_bfloat16)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_imul_2", shim_raw_str)
+        shim_stream.write_with_key(
+            "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", shim_raw_str
+        )
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_imul_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller,
+            signature(int64, CPointer(_type___nv_bfloat16)),
             ptrs,
         )
 
 
-_operator_imul_2_lower(shim_stream, shim_obj)
+_lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __ll2bfloat16_rn():
+    pass
 
 
-def _operator_itruediv_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_itruediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator/=(*lh, *rh);
+    _ZL16__ll2bfloat16_rnx_nbst(__nv_bfloat16 &retval , long long* i) {
+        retval = __ll2bfloat16_rn(*i);
         return 0;
     }
         """
 
-    operator_itruediv_2 = declare_device(
-        "operator_itruediv_2",
-        _type___nv_bfloat162(
-            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
-        ),
+    _ZL16__ll2bfloat16_rnx_nbst = declare_device(
+        "_ZL16__ll2bfloat16_rnx_nbst", _type___nv_bfloat16(CPointer(int64))
     )
 
-    def operator_itruediv_2_caller(arg_0, arg_1):
-        return operator_itruediv_2(arg_0, arg_1)
+    def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0):
+        return _ZL16__ll2bfloat16_rnx_nbst(arg_0)
 
-    @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162)
+    @lower(__ll2bfloat16_rn, int64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_itruediv_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_itruediv_2_caller,
-            signature(
-                _type___nv_bfloat162,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
+            _ZL16__ll2bfloat16_rnx_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int64)),
             ptrs,
         )
 
 
-_operator_itruediv_2_lower(shim_stream, shim_obj)
+_lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj)
+
+
+def __ll2bfloat16_rz():
+    pass
 
 
-def _operator_pos_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_pos_2(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = operator+(*h);
+    _ZL16__ll2bfloat16_rzx_nbst(__nv_bfloat16 &retval , long long* i) {
+        retval = __ll2bfloat16_rz(*i);
         return 0;
     }
         """
 
-    operator_pos_2 = declare_device(
-        "operator_pos_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL16__ll2bfloat16_rzx_nbst = declare_device(
+        "_ZL16__ll2bfloat16_rzx_nbst", _type___nv_bfloat16(CPointer(int64))
     )
 
-    def operator_pos_2_caller(arg_0):
-        return operator_pos_2(arg_0)
+    def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0):
+        return _ZL16__ll2bfloat16_rzx_nbst(arg_0)
 
-    @lower(operator.pos, _type___nv_bfloat162)
+    @lower(__ll2bfloat16_rz, int64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_pos_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_pos_2_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL16__ll2bfloat16_rzx_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int64)),
             ptrs,
         )
 
 
-_operator_pos_2_lower(shim_stream, shim_obj)
+_lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj)
 
 
-def _operator_neg_2_lower(shim_stream, shim_obj):
+def __ll2bfloat16_rd():
+    pass
+
+
+def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_neg_2(__nv_bfloat162 &retval , __nv_bfloat162* h) {
-        retval = operator-(*h);
+    _ZL16__ll2bfloat16_rdx_nbst(__nv_bfloat16 &retval , long long* i) {
+        retval = __ll2bfloat16_rd(*i);
         return 0;
     }
         """
 
-    operator_neg_2 = declare_device(
-        "operator_neg_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162))
+    _ZL16__ll2bfloat16_rdx_nbst = declare_device(
+        "_ZL16__ll2bfloat16_rdx_nbst", _type___nv_bfloat16(CPointer(int64))
     )
 
-    def operator_neg_2_caller(arg_0):
-        return operator_neg_2(arg_0)
+    def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0):
+        return _ZL16__ll2bfloat16_rdx_nbst(arg_0)
 
-    @lower(operator.neg, _type___nv_bfloat162)
+    @lower(__ll2bfloat16_rd, int64)
     def impl(context, builder, sig, args):
         context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_neg_2", shim_raw_str)
+        shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str)
         ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
         for ptr, ty, arg in zip(ptrs, sig.args, args):
             builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
 
         return context.compile_internal(
             builder,
-            operator_neg_2_caller,
-            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            _ZL16__ll2bfloat16_rdx_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int64)),
             ptrs,
         )
 
 
-_operator_neg_2_lower(shim_stream, shim_obj)
+_lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj)
+
+
+def __ll2bfloat16_ru():
+    pass
 
 
-def _operator_eq_2_lower(shim_stream, shim_obj):
+def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj):
     shim_raw_str = """
     extern "C" __device__ int
-    operator_eq_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator==(*lh, *rh);
+    _ZL16__ll2bfloat16_rux_nbst(__nv_bfloat16 &retval , long long* i) {
+        retval = __ll2bfloat16_ru(*i);
+        return 0;
+    }
+        """
+
+    _ZL16__ll2bfloat16_rux_nbst = declare_device(
+        "_ZL16__ll2bfloat16_rux_nbst", _type___nv_bfloat16(CPointer(int64))
+    )
+
+    def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0):
+        return _ZL16__ll2bfloat16_rux_nbst(arg_0)
+
+    @lower(__ll2bfloat16_ru, int64)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str)
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL16__ll2bfloat16_rux_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(int64)),
+            ptrs,
+        )
+
+
+_lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj)
+
+
+def htrunc():
+    pass
+
+
+def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6htrunc13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = htrunc(*h);
+        return 0;
+    }
+        """
+
+    _ZL6htrunc13__nv_bfloat16_nbst = declare_device(
+        "_ZL6htrunc13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL6htrunc13__nv_bfloat16_nbst(arg_0)
+
+    @lower(htrunc, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL6htrunc13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL6htrunc13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def hceil():
+    pass
+
+
+def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5hceil13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = hceil(*h);
         return 0;
     }
         """
 
-    operator_eq_2 = declare_device(
-        "operator_eq_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+    _ZL5hceil13__nv_bfloat16_nbst = declare_device(
+        "_ZL5hceil13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL5hceil13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hceil, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL5hceil13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL5hceil13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def hfloor():
+    pass
+
+
+def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6hfloor13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = hfloor(*h);
+        return 0;
+    }
+        """
+
+    _ZL6hfloor13__nv_bfloat16_nbst = declare_device(
+        "_ZL6hfloor13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL6hfloor13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hfloor, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL6hfloor13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL6hfloor13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def hrint():
+    pass
+
+
+def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5hrint13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = hrint(*h);
+        return 0;
+    }
+        """
+
+    _ZL5hrint13__nv_bfloat16_nbst = declare_device(
+        "_ZL5hrint13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL5hrint13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hrint, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL5hrint13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL5hrint13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def h2trunc():
+    pass
+
+
+def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7h2trunc14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = h2trunc(*h);
+        return 0;
+    }
+        """
+
+    _ZL7h2trunc14__nv_bfloat162_nbst = declare_device(
+        "_ZL7h2trunc14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2trunc, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL7h2trunc14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL7h2trunc14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def h2ceil():
+    pass
+
+
+def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2ceil14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = h2ceil(*h);
+        return 0;
+    }
+        """
+
+    _ZL6h2ceil14__nv_bfloat162_nbst = declare_device(
+        "_ZL6h2ceil14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2ceil, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL6h2ceil14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL6h2ceil14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def h2floor():
+    pass
+
+
+def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7h2floor14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = h2floor(*h);
+        return 0;
+    }
+        """
+
+    _ZL7h2floor14__nv_bfloat162_nbst = declare_device(
+        "_ZL7h2floor14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL7h2floor14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2floor, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL7h2floor14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL7h2floor14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def h2rint():
+    pass
+
+
+def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2rint14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = h2rint(*h);
+        return 0;
+    }
+        """
+
+    _ZL6h2rint14__nv_bfloat162_nbst = declare_device(
+        "_ZL6h2rint14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL6h2rint14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2rint, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL6h2rint14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL6h2rint14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __bfloat162bfloat162():
+    pass
+
+
+def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(__nv_bfloat162 &retval , __nv_bfloat16* a) {
+        retval = __bfloat162bfloat162(*a);
+        return 0;
+    }
+        """
+
+    _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst = declare_device(
+        "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0):
+        return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0)
+
+    @lower(__bfloat162bfloat162, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __lowhigh2highlow():
+    pass
+
+
+def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __lowhigh2highlow(*a);
+        return 0;
+    }
+        """
+
+    _ZL17__lowhigh2highlow14__nv_bfloat162_nbst = declare_device(
+        "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__lowhigh2highlow, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(
+            "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __lows2bfloat162():
+    pass
+
+
+def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__lows2bfloat162``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __lows2bfloat162(*a, *b);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(_type___nv_bfloat162),
+        CPointer(_type___nv_bfloat162),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1):
+        return shim_fn(arg_0, arg_1)
+
+    @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj)
+
+
+def __highs2bfloat162():
+    """Empty stub; the ``@lower`` key for CUDA ``__highs2bfloat162``."""
+
+
+def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__highs2bfloat162``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __highs2bfloat162(*a, *b);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(_type___nv_bfloat162),
+        CPointer(_type___nv_bfloat162),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1):
+        return shim_fn(arg_0, arg_1)
+
+    @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj)
+
+
+def __high2bfloat16():
+    """Empty stub; the ``@lower`` key for CUDA ``__high2bfloat16``."""
+
+
+def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__high2bfloat16``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL15__high2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) {
+        retval = __high2bfloat16(*a);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL15__high2bfloat1614__nv_bfloat162_nbst"
+    shim_sig = signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__high2bfloat16, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __low2bfloat16():
+    """Empty stub; the ``@lower`` key for CUDA ``__low2bfloat16``."""
+
+
+def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__low2bfloat16``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL14__low2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) {
+        retval = __low2bfloat16(*a);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL14__low2bfloat1614__nv_bfloat162_nbst"
+    shim_sig = signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__low2bfloat16, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __hisinf():
+    """Empty stub; the ``@lower`` key for CUDA ``__hisinf``."""
+
+
+def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__hisinf``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hisinf13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* a) {
+        retval = __hisinf(*a);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL8__hisinf13__nv_bfloat16_nbst"
+    shim_sig = signature(int32, CPointer(_type___nv_bfloat16))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__hisinf, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder, _ZL8__hisinf13__nv_bfloat16_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __halves2bfloat162():
+    """Empty stub; the ``@lower`` key for CUDA ``__halves2bfloat162``."""
+
+
+def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__halves2bfloat162``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __halves2bfloat162(*a, *b);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(_type___nv_bfloat16),
+        CPointer(_type___nv_bfloat16),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1):
+        return shim_fn(arg_0, arg_1)
+
+    @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj)
+
+
+def __low2bfloat162():
+    """Empty stub; the ``@lower`` key for CUDA ``__low2bfloat162``."""
+
+
+def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__low2bfloat162``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL15__low2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __low2bfloat162(*a);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL15__low2bfloat16214__nv_bfloat162_nbst"
+    shim_sig = signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__low2bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __high2bfloat162():
+    """Empty stub; the ``@lower`` key for CUDA ``__high2bfloat162``."""
+
+
+def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__high2bfloat162``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL16__high2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __high2bfloat162(*a);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL16__high2bfloat16214__nv_bfloat162_nbst"
+    shim_sig = signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__high2bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def __bfloat16_as_short():
+    """Empty stub; the ``@lower`` key for CUDA ``__bfloat16_as_short``."""
+
+
+def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__bfloat16_as_short``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) {
+        retval = __bfloat16_as_short(*h);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst"
+    shim_sig = signature(int16, CPointer(_type___nv_bfloat16))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__bfloat16_as_short, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __bfloat16_as_ushort():
+    """Empty stub; the ``@lower`` key for CUDA ``__bfloat16_as_ushort``."""
+
+
+def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__bfloat16_as_ushort``."""
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) {
+        retval = __bfloat16_as_ushort(*h);
+        return 0;
+    }
+        """
+
+    # Numba-side signature of the shim: every argument is passed by pointer.
+    symbol = "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst"
+    shim_sig = signature(uint16, CPointer(_type___nv_bfloat16))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__bfloat16_as_ushort, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+def __short_as_bfloat16():
+    """Empty stub; the ``@lower`` key for CUDA ``__short_as_bfloat16``."""
+
+
+def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__short_as_bfloat16``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL19__short_as_bfloat16s_nbst(__nv_bfloat16 &retval , short* i) {
+        retval = __short_as_bfloat16(*i);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL19__short_as_bfloat16s_nbst"
+    shim_sig = signature(_type___nv_bfloat16, CPointer(int16))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL19__short_as_bfloat16s_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__short_as_bfloat16, int16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder, _ZL19__short_as_bfloat16s_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj)
+
+
+def __ushort_as_bfloat16():
+    """Empty stub; the ``@lower`` key for CUDA ``__ushort_as_bfloat16``."""
+
+
+def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj):
+    """Register the shim-backed lowering of ``__ushort_as_bfloat16``.
+
+    On lowering, writes the shim to ``shim_stream`` and links ``shim_obj``.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL20__ushort_as_bfloat16t_nbst(__nv_bfloat16 &retval , unsigned short* i) {
+        retval = __ushort_as_bfloat16(*i);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL20__ushort_as_bfloat16t_nbst"
+    shim_sig = signature(_type___nv_bfloat16, CPointer(uint16))
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0):
+        return shim_fn(arg_0)
+
+    @lower(__ushort_as_bfloat16, uint16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder, _ZL20__ushort_as_bfloat16t_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj)
+
+
+def __shfl_sync():
+    """Empty stub; the ``@lower`` key for the CUDA ``__shfl_sync`` overloads."""
+
+
+def _lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_sync`` for ``__nv_bfloat162``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat162, int32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__shfl_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* srcLane, int* width) {
+        retval = __shfl_sync(*mask, *var, *srcLane, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat162),
+        CPointer(int32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj)
+
+
+def __shfl_up_sync():
+    """Empty stub; the ``@lower`` key for the CUDA ``__shfl_up_sync`` overloads."""
+
+
+def _lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_up_sync`` for ``__nv_bfloat162``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat162, uint32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) {
+        retval = __shfl_up_sync(*mask, *var, *delta, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat162),
+        CPointer(uint32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj)
+
+
+def __shfl_down_sync():
+    """Empty stub; the ``@lower`` key for CUDA ``__shfl_down_sync`` overloads."""
+
+
+def _lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_down_sync`` for ``__nv_bfloat162``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat162, uint32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) {
+        retval = __shfl_down_sync(*mask, *var, *delta, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat162),
+        CPointer(uint32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj)
+
+
+def __shfl_xor_sync():
+    """Empty stub; the ``@lower`` key for CUDA ``__shfl_xor_sync`` overloads."""
+
+
+def _lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_xor_sync`` for ``__nv_bfloat162``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat162, int32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* laneMask, int* width) {
+        retval = __shfl_xor_sync(*mask, *var, *laneMask, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat162,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat162),
+        CPointer(int32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_sync`` for ``__nv_bfloat16``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat16, int32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__shfl_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* srcLane, int* width) {
+        retval = __shfl_sync(*mask, *var, *srcLane, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat16,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat16),
+        CPointer(int32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_up_sync`` for ``__nv_bfloat16``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat16, uint32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, unsigned int* delta, int* width) {
+        retval = __shfl_up_sync(*mask, *var, *delta, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat16,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat16),
+        CPointer(uint32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_down_sync`` for ``__nv_bfloat16``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat16, uint32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, unsigned int* delta, int* width) {
+        retval = __shfl_down_sync(*mask, *var, *delta, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat16,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat16),
+        CPointer(uint32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj):
+    """Register the lowering of ``__shfl_xor_sync`` for ``__nv_bfloat16``.
+
+    Handles the overload lowered for argument types
+    ``(uint32, _type___nv_bfloat16, int32, int32)``.
+
+    On lowering, the shim source is written to ``shim_stream`` under the
+    shim's symbol name and ``shim_obj`` is added to the active code library
+    as a linking file.
+    """
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* laneMask, int* width) {
+        retval = __shfl_xor_sync(*mask, *var, *laneMask, *width);
+        return 0;
+    }
+        """
+
+    symbol = "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst"
+    shim_sig = signature(
+        _type___nv_bfloat16,
+        CPointer(uint32),
+        CPointer(_type___nv_bfloat16),
+        CPointer(int32),
+        CPointer(int32),
+    )
+    shim_fn = declare_device(symbol, shim_sig)
+
+    def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller(
+        arg_0, arg_1, arg_2, arg_3
+    ):
+        return shim_fn(arg_0, arg_1, arg_2, arg_3)
+
+    @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        # Spill each argument to a stack slot; the shim takes addresses.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        return context.compile_internal(
+            builder,
+            _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller,
+            shim_sig,
+            slots,
+        )
+
+
+_lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj)
+
+
+def __ldg():  # Empty stub; the function object is the key the @lower registrations for __ldg use.
+    pass
+
+
+def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldg(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__ldgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldg(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldg(*ptr); always returns 0
+
+    _ZL5__ldgPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL5__ldgPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldg, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL5__ldgPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL5__ldgPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldg(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__ldgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldg(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldg(*ptr); always returns 0
+
+    _ZL5__ldgPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL5__ldgPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldg, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL5__ldgPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL5__ldgPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __ldcg():  # Empty stub; the function object is the key the @lower registrations for __ldcg use.
+    pass
+
+
+def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcg(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldcg(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcg(*ptr); always returns 0
+
+    _ZL6__ldcgPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcgPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldcg, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcgPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcgPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcg(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldcg(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcg(*ptr); always returns 0
+
+    _ZL6__ldcgPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcgPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldcg, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcgPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcgPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __ldca():  # Empty stub; the function object is the key the @lower registrations for __ldca use.
+    pass
+
+
+def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldca(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcaPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldca(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldca(*ptr); always returns 0
+
+    _ZL6__ldcaPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcaPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldca, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcaPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcaPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldca(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcaPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldca(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldca(*ptr); always returns 0
+
+    _ZL6__ldcaPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcaPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldca, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcaPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcaPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __ldcs():  # Empty stub; the function object is the key the @lower registrations for __ldcs use.
+    pass
+
+
+def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcs(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcsPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldcs(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcs(*ptr); always returns 0
+
+    _ZL6__ldcsPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcsPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldcs, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcsPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcsPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcs(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcsPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldcs(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcs(*ptr); always returns 0
+
+    _ZL6__ldcsPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcsPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldcs, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcsPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcsPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __ldlu():  # Empty stub; the function object is the key the @lower registrations for __ldlu use.
+    pass
+
+
+def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldlu(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldluPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldlu(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldlu(*ptr); always returns 0
+
+    _ZL6__ldluPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldluPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldlu, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldluPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldluPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldlu(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldluPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldlu(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldlu(*ptr); always returns 0
+
+    _ZL6__ldluPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldluPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldlu, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldluPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldluPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __ldcv():  # Empty stub; the function object is the key the @lower registrations for __ldcv use.
+    pass
+
+
+def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcv(CPointer(_type___nv_bfloat162)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcvPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) {
+        retval = __ldcv(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcv(*ptr); always returns 0
+
+    _ZL6__ldcvPK14__nv_bfloat162_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcvPK14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(__ldcv, CPointer(_type___nv_bfloat162))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcvPK14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcvPK14__nv_bfloat162_nbst_caller,
+            signature(
+                _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Define and register the lowering for __ldcv(CPointer(_type___nv_bfloat16)).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__ldcvPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) {
+        retval = __ldcv(*ptr);
+        return 0;
+    }
+        """  # shim source: retval = __ldcv(*ptr); always returns 0
+
+    _ZL6__ldcvPK13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol; the argument is a pointer to the original pointer argument
+        "_ZL6__ldcvPK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0):  # forwarder that impl compiles via compile_internal
+        return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__ldcv, CPointer(_type___nv_bfloat16))  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__ldcvPK13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__ldcvPK13__nv_bfloat16_nbst_caller,
+            signature(
+                _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16))
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __stwb():  # Empty stub; the function object is the key the @lower registrations for __stwb use.
+    pass
+
+
+def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stwb(CPointer(_type___nv_bfloat162), _type___nv_bfloat162).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stwbP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) {
+        __stwb(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stwb(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stwbP14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stwbP14__nv_bfloat162S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat162)),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stwbP14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stwbP14__nv_bfloat162S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat162)),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stwb(CPointer(_type___nv_bfloat16), _type___nv_bfloat16).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stwbP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) {
+        __stwb(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stwb(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stwbP13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stwbP13__nv_bfloat16S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat16)),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stwbP13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stwbP13__nv_bfloat16S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat16)),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __stcg():  # Empty stub; the function object is the key the @lower registrations for __stcg use.
+    pass
+
+
+def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stcg(CPointer(_type___nv_bfloat162), _type___nv_bfloat162).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stcgP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) {
+        __stcg(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stcg(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stcgP14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stcgP14__nv_bfloat162S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat162)),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stcgP14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stcgP14__nv_bfloat162S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat162)),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stcg(CPointer(_type___nv_bfloat16), _type___nv_bfloat16).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stcgP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) {
+        __stcg(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stcg(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stcgP13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stcgP13__nv_bfloat16S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat16)),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stcgP13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stcgP13__nv_bfloat16S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat16)),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __stcs():  # Empty stub; the function object is the key the @lower registrations for __stcs use.
+    pass
+
+
+def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stcs(CPointer(_type___nv_bfloat162), _type___nv_bfloat162).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stcsP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) {
+        __stcs(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stcs(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stcsP14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stcsP14__nv_bfloat162S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat162)),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stcsP14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stcsP14__nv_bfloat162S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat162)),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stcs(CPointer(_type___nv_bfloat16), _type___nv_bfloat16).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stcsP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) {
+        __stcs(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stcs(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stcsP13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stcsP13__nv_bfloat16S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat16)),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stcsP13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stcsP13__nv_bfloat16S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat16)),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __stwt():  # Empty stub; the function object is the key the @lower registrations for __stwt use.
+    pass
+
+
+def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stwt(CPointer(_type___nv_bfloat162), _type___nv_bfloat162).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stwtP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) {
+        __stwt(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stwt(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stwtP14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stwtP14__nv_bfloat162S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat162)),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stwtP14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stwtP14__nv_bfloat162S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat162)),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __stwt(CPointer(_type___nv_bfloat16), _type___nv_bfloat16).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__stwtP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) {
+        __stwt(*ptr, *value);
+        return 0;
+    }
+        """  # shim source: calls __stwt(*ptr, *value); retval is never written; always returns 0
+
+    _ZL6__stwtP13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol as returning void; each argument is a pointer to the original argument
+        "_ZL6__stwtP13__nv_bfloat16S__nbst",
+        void(
+            CPointer(CPointer(_type___nv_bfloat16)),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__stwtP13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__stwtP13__nv_bfloat16S__nbst_caller,
+            signature(
+                void,
+                CPointer(CPointer(_type___nv_bfloat16)),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __heq2():  # Empty stub; the function object is the key the @lower registration for __heq2 uses.
+    pass
+
+
+def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Define and register the lowering for __heq2(_type___nv_bfloat162, _type___nv_bfloat162).
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__heq214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __heq2(*a, *b);
+        return 0;
+    }
+        """  # shim source: retval = __heq2(*a, *b); always returns 0
+
+    _ZL6__heq214__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol; each argument is a pointer to the original argument
+        "_ZL6__heq214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )  # NOTE(review): the shim's leading retval parameter is not listed here; presumably implied by the call ABI - confirm
+
+    def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # forwarder that impl compiles via compile_internal
+        return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162)  # register impl as the lowering for this argument-type signature
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # pass the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__heq214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # copy each argument into its slot; align is ty.alignof_ if defined, else None
+
+        return context.compile_internal(  # compile the forwarder for the pointer-typed signature, passing the slots as arguments
+            builder,
+            _ZL6__heq214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run the registration when this module executes
+
+
+def __hne2():  # Empty stub; the function object is the key the @lower registration for __hne2 uses.
+    pass
+
+
+def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hne2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hne214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hne2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL6__hne214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL6__hne214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hne2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hne214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL6__hne214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hne2 registration when this module runs.
+
+
+def __hle2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hle2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hle214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hle2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL6__hle214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL6__hle214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hle2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hle214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL6__hle214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hle2 registration when this module runs.
+
+
+def __hge2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hge2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hge214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hge2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL6__hge214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL6__hge214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hge2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hge214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL6__hge214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hge2 registration when this module runs.
+
+
+def __hlt2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hlt2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hlt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hlt2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL6__hlt214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL6__hlt214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hlt2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hlt214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL6__hlt214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hlt2 registration when this module runs.
+
+
+def __hgt2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hgt2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hgt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgt2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL6__hgt214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL6__hgt214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hgt2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hgt214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL6__hgt214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hgt2 registration when this module runs.
+
+
+def __hequ2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hequ2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hequ214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hequ2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hequ214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hequ214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hequ2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hequ214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hequ214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hequ2 registration when this module runs.
+
+
+def __hneu2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hneu2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hneu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hneu2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hneu214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hneu214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hneu2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hneu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hneu214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hneu2 registration when this module runs.
+
+
+def __hleu2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hleu2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hleu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hleu2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hleu214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hleu214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hleu2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hleu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hleu214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hleu2 registration when this module runs.
+
+
+def __hgeu2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hgeu2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hgeu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgeu2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hgeu214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hgeu214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hgeu2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hgeu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hgeu214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hgeu2 registration when this module runs.
+
+
+def __hltu2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hltu2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hltu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hltu2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hltu214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hltu214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hltu2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hltu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hltu214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hltu2 registration when this module runs.
+
+
+def __hgtu2():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hgtu2 for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hgtu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgtu2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the result is written through retval; the int return is always 0.
+
+    _ZL7__hgtu214__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL7__hgtu214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),  # Declared signature: __nv_bfloat162(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hgtu2 stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL7__hgtu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL7__hgtu214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hgtu2 registration when this module runs.
+
+
+def __heq2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __heq2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__heq2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __heq2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__heq2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__heq2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __heq2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__heq2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __heq2_mask registration when this module runs.
+
+
+def __hne2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hne2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hne2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hne2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__hne2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__hne2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hne2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__hne2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hne2_mask registration when this module runs.
+
+
+def __hle2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hle2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hle2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hle2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__hle2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__hle2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hle2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__hle2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hle2_mask registration when this module runs.
+
+
+def __hge2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hge2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hge2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hge2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__hge2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__hge2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hge2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__hge2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hge2_mask registration when this module runs.
+
+
+def __hlt2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hlt2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hlt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hlt2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__hlt2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__hlt2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hlt2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hlt2_mask registration when this module runs.
+
+
+def __hgt2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hgt2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hgt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgt2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL11__hgt2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL11__hgt2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hgt2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hgt2_mask registration when this module runs.
+
+
+def __hequ2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hequ2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hequ2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hequ2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL12__hequ2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL12__hequ2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hequ2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hequ2_mask registration when this module runs.
+
+
+def __hneu2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hneu2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hneu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hneu2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL12__hneu2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL12__hneu2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hneu2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hneu2_mask registration when this module runs.
+
+
+def __hleu2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hleu2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hleu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hleu2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL12__hleu2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL12__hleu2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hleu2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Top-level call: performs the __hleu2_mask registration when this module runs.
+
+
+def __hgeu2_mask():  # Placeholder with no behavior; used as the key of the @lower registration below.
+    pass
+
+
+def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Registers the lowering of __hgeu2_mask for two __nv_bfloat162 operands.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgeu2_mask(*a, *b);
+        return 0;
+    }
+        """  # C++ shim: the unsigned int result is written through retval; the int return is always 0.
+
+    _ZL12__hgeu2_mask14__nv_bfloat162S__nbst = declare_device(  # Declares the shim symbol for use from the caller below.
+        "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),  # Declared signature: uint32(__nv_bfloat162*, __nv_bfloat162*).
+    )
+
+    def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Forwards both pointers to the declared shim.
+        return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the __hgeu2_mask stub with by-value operand types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per operand type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_.
+
+        return context.compile_internal(  # Compile the caller with the operand pointers as its arguments.
+            builder,
+            _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hltu2_mask():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hltu2_mask
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hltu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hltu2_mask(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hltu2_mask on the dereferenced arguments and writes retval
+
+    _ZL12__hltu2_mask14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL12__hltu2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hgtu2_mask():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hgtu2_mask
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hgtu2_mask(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hgtu2_mask on the dereferenced arguments and writes retval
+
+    _ZL12__hgtu2_mask14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst",
+        uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller,
+            signature(
+                uint32,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hisnan2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj):  # set up and register the lowering of __hisnan2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9__hisnan214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __hisnan2(*a);
+        return 0;
+    }
+        """  # shim source: calls __hisnan2 on the dereferenced argument and writes retval
+
+    _ZL9__hisnan214__nv_bfloat162_nbst = declare_device(  # declare the shim symbol with a pointer-typed argument
+        "_ZL9__hisnan214__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0):  # wrapper handed to compile_internal in impl
+        return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0)
+
+    @lower(__hisnan2, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # the argument arrives by value; the shim takes a pointer
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL9__hisnan214__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer argument; its result is returned
+            builder,
+            _ZL9__hisnan214__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hadd2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hadd2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hadd214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hadd2(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hadd2 on the dereferenced arguments and writes retval
+
+    _ZL7__hadd214__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL7__hadd214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__hadd214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL7__hadd214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hsub2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hsub2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hsub214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hsub2(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hsub2 on the dereferenced arguments and writes retval
+
+    _ZL7__hsub214__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL7__hsub214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__hsub214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL7__hsub214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hmul2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hmul2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hmul214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmul2(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hmul2 on the dereferenced arguments and writes retval
+
+    _ZL7__hmul214__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL7__hmul214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__hmul214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL7__hmul214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hadd2_rn():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hadd2_rn
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hadd2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hadd2_rn(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hadd2_rn on the dereferenced arguments and writes retval
+
+    _ZL10__hadd2_rn14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL10__hadd2_rn14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hsub2_rn():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hsub2_rn
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hsub2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hsub2_rn(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hsub2_rn on the dereferenced arguments and writes retval
+
+    _ZL10__hsub2_rn14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL10__hsub2_rn14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hmul2_rn():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hmul2_rn
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hmul2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmul2_rn(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hmul2_rn on the dereferenced arguments and writes retval
+
+    _ZL10__hmul2_rn14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL10__hmul2_rn14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __h2div():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __h2div
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__h2div14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __h2div(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __h2div on the dereferenced arguments and writes retval
+
+    _ZL7__h2div14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL7__h2div14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__h2div14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL7__h2div14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __habs2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj):  # set up and register the lowering of __habs2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__habs214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __habs2(*a);
+        return 0;
+    }
+        """  # shim source: calls __habs2 on the dereferenced argument and writes retval
+
+    _ZL7__habs214__nv_bfloat162_nbst = declare_device(  # declare the shim symbol with a pointer-typed argument
+        "_ZL7__habs214__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0):  # wrapper handed to compile_internal in impl
+        return _ZL7__habs214__nv_bfloat162_nbst(arg_0)
+
+    @lower(__habs2, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # the argument arrives by value; the shim takes a pointer
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__habs214__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer argument; its result is returned
+            builder,
+            _ZL7__habs214__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hadd2_sat():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hadd2_sat
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hadd2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hadd2_sat(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hadd2_sat on the dereferenced arguments and writes retval
+
+    _ZL11__hadd2_sat14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL11__hadd2_sat14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hsub2_sat():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hsub2_sat
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hsub2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hsub2_sat(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hsub2_sat on the dereferenced arguments and writes retval
+
+    _ZL11__hsub2_sat14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL11__hsub2_sat14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hmul2_sat():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hmul2_sat
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hmul2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmul2_sat(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hmul2_sat on the dereferenced arguments and writes retval
+
+    _ZL11__hmul2_sat14__nv_bfloat162S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL11__hmul2_sat14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hfma2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hfma2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hfma214__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) {
+        retval = __hfma2(*a, *b, *c);
+        return 0;
+    }
+        """  # shim source: calls __hfma2 on the dereferenced arguments and writes retval
+
+    _ZL7__hfma214__nv_bfloat162S_S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL7__hfma214__nv_bfloat162S_S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )
+
+    def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2):  # wrapper handed to compile_internal in impl
+        return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(
+        __hfma2,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+    )
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__hfma214__nv_bfloat162S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL7__hfma214__nv_bfloat162S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hfma2_sat():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hfma2_sat
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) {
+        retval = __hfma2_sat(*a, *b, *c);
+        return 0;
+    }
+        """  # shim source: calls __hfma2_sat on the dereferenced arguments and writes retval
+
+    _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )
+
+    def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2):  # wrapper handed to compile_internal in impl
+        return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(
+        __hfma2_sat,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+    )
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hneg2():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj):  # set up and register the lowering of __hneg2
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hneg214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = __hneg2(*a);
+        return 0;
+    }
+        """  # shim source: calls __hneg2 on the dereferenced argument and writes retval
+
+    _ZL7__hneg214__nv_bfloat162_nbst = declare_device(  # declare the shim symbol with a pointer-typed argument
+        "_ZL7__hneg214__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0):  # wrapper handed to compile_internal in impl
+        return _ZL7__hneg214__nv_bfloat162_nbst(arg_0)
+
+    @lower(__hneg2, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):  # the argument arrives by value; the shim takes a pointer
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL7__hneg214__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer argument; its result is returned
+            builder,
+            _ZL7__hneg214__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __habs():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj):  # set up and register the lowering of __habs
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__habs13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = __habs(*a);
+        return 0;
+    }
+        """  # shim source: calls __habs on the dereferenced argument and writes retval
+
+    _ZL6__habs13__nv_bfloat16_nbst = declare_device(  # declare the shim symbol with a pointer-typed argument
+        "_ZL6__habs13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0):  # wrapper handed to compile_internal in impl
+        return _ZL6__habs13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__habs, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):  # the argument arrives by value; the shim takes a pointer
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__habs13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer argument; its result is returned
+            builder,
+            _ZL6__habs13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hadd():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hadd
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hadd13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hadd(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hadd on the dereferenced arguments and writes retval
+
+    _ZL6__hadd13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL6__hadd13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__hadd13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL6__hadd13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # module-level call: runs declare_device and the @lower registration in the helper
+
+
+def __hsub():  # empty placeholder function; used as the first argument to @lower in the lowering helper below
+    pass
+
+
+def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # set up and register the lowering of __hsub
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hsub13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hsub(*a, *b);
+        return 0;
+    }
+        """  # shim source: calls __hsub on the dereferenced arguments and writes retval
+
+    _ZL6__hsub13__nv_bfloat16S__nbst = declare_device(  # declare the shim symbol with pointer-typed arguments
+        "_ZL6__hsub13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # wrapper handed to compile_internal in impl
+        return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):  # arguments arrive by value; the shim takes pointers
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # hand the shim source to shim_stream, keyed by the symbol name
+            "_ZL6__hsub13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument type
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is the type's alignof_ attribute, or None if absent
+
+        return context.compile_internal(  # compile the wrapper on the pointer arguments; its result is returned
+            builder,
+            _ZL6__hsub13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj)
+
+
+def __hmul():
+    """Empty placeholder for ``__hmul``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hmul(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hmul13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmul(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL6__hmul13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL6__hmul13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hmul pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL6__hmul13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL6__hmul13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hadd_rn():
+    """Empty placeholder for ``__hadd_rn``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hadd_rn(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9__hadd_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hadd_rn(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL9__hadd_rn13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL9__hadd_rn13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hadd_rn pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL9__hadd_rn13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hsub_rn():
+    """Empty placeholder for ``__hsub_rn``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hsub_rn(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9__hsub_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hsub_rn(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL9__hsub_rn13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL9__hsub_rn13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hsub_rn pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL9__hsub_rn13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hmul_rn():
+    """Empty placeholder for ``__hmul_rn``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hmul_rn(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9__hmul_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmul_rn(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL9__hmul_rn13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL9__hmul_rn13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hmul_rn pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL9__hmul_rn13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hdiv():
+    """Empty placeholder for ``__hdiv``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hdiv(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hdiv13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hdiv(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL6__hdiv13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL6__hdiv13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hdiv pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL6__hdiv13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL6__hdiv13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hadd_sat():
+    """Empty placeholder for ``__hadd_sat``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hadd_sat(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hadd_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hadd_sat(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL10__hadd_sat13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL10__hadd_sat13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hadd_sat pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL10__hadd_sat13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hsub_sat():
+    """Empty placeholder for ``__hsub_sat``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hsub_sat(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hsub_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hsub_sat(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL10__hsub_sat13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL10__hsub_sat13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hsub_sat pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL10__hsub_sat13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hmul_sat():
+    """Empty placeholder for ``__hmul_sat``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # binds __hmul_sat(__nv_bfloat16, __nv_bfloat16) to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hmul_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmul_sat(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL10__hmul_sat13__nv_bfloat16S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL10__hmul_sat13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16)  # callers of __hmul_sat pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL10__hmul_sat13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hfma():
+    """Empty placeholder for ``__hfma``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj):  # binds the three-operand __hfma on __nv_bfloat16 to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hfma13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) {
+        retval = __hfma(*a, *b, *c);
+        return 0;
+    }
+        """
+
+    _ZL6__hfma13__nv_bfloat16S_S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL6__hfma13__nv_bfloat16S_S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )
+
+    def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2):  # plain forwarder that impl passes to compile_internal
+        return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(  # callers of __hfma pass the operands by value
+        __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
+    )
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL6__hfma13__nv_bfloat16S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL6__hfma13__nv_bfloat16S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hfma_sat():
+    """Empty placeholder for ``__hfma_sat``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj):  # binds the three-operand __hfma_sat on __nv_bfloat16 to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) {
+        retval = __hfma_sat(*a, *b, *c);
+        return 0;
+    }
+        """
+
+    _ZL10__hfma_sat13__nv_bfloat16S_S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )
+
+    def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2):  # plain forwarder that impl passes to compile_internal
+        return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(  # callers of __hfma_sat pass the operands by value
+        __hfma_sat,
+        _type___nv_bfloat16,
+        _type___nv_bfloat16,
+        _type___nv_bfloat16,
+    )
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hneg():
+    """Empty placeholder for ``__hneg``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj):  # binds the single-operand __hneg on __nv_bfloat16 to the C++ shim below (result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hneg13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = __hneg(*a);
+        return 0;
+    }
+        """
+
+    _ZL6__hneg13__nv_bfloat16_nbst = declare_device(  # declared with a pointer parameter, like the shim's operand
+        "_ZL6__hneg13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0):  # plain forwarder that impl passes to compile_internal
+        return _ZL6__hneg13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__hneg, _type___nv_bfloat16)  # callers of __hneg pass the operand by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL6__hneg13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointer
+            builder,
+            _ZL6__hneg13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbeq2():
+    """Empty placeholder for ``__hbeq2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbeq2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hbeq214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbeq2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hbeq214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hbeq214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbeq2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hbeq214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hbeq214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbne2():
+    """Empty placeholder for ``__hbne2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbne2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hbne214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbne2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hbne214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hbne214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbne2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hbne214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hbne214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hble2():
+    """Empty placeholder for ``__hble2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hble2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hble214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hble2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hble214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hble214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hble2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hble214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hble214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbge2():
+    """Empty placeholder for ``__hbge2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbge2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hbge214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbge2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hbge214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hbge214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbge2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hbge214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hbge214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hblt2():
+    """Empty placeholder for ``__hblt2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hblt2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hblt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hblt2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hblt214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hblt214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hblt2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hblt214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hblt214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbgt2():
+    """Empty placeholder for ``__hbgt2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbgt2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hbgt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbgt2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL7__hbgt214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL7__hbgt214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbgt2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL7__hbgt214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL7__hbgt214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbequ2():
+    """Empty placeholder for ``__hbequ2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbequ2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbequ214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbequ2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL8__hbequ214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL8__hbequ214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbequ2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL8__hbequ214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL8__hbequ214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbneu2():
+    """Empty placeholder for ``__hbneu2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbneu2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbneu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbneu2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL8__hbneu214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL8__hbneu214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbneu2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL8__hbneu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL8__hbneu214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbleu2():
+    """Empty placeholder for ``__hbleu2``; used as the key passed to ``lower`` further down."""
+
+
+def _lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # binds __hbleu2(__nv_bfloat162, __nv_bfloat162) to the C++ shim below (bool result via retval, returns 0)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbleu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbleu2(*a, *b);
+        return 0;
+    }
+        """
+
+    _ZL8__hbleu214__nv_bfloat162S__nbst = declare_device(  # declared with pointer parameters, like the shim's operands
+        "_ZL8__hbleu214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # plain forwarder that impl passes to compile_internal
+        return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162)  # callers of __hbleu2 pass the operands by value
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # add shim_obj as a linking file of the active code library
+        shim_stream.write_with_key(  # shim source is written under its symbol name
+            "_ZL8__hbleu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one alloca per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # store each value into its slot; alignment comes from the type's alignof_ if it has one
+
+        return context.compile_internal(  # invoke the forwarder on the slot pointers
+            builder,
+            _ZL8__hbleu214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # run at module level so the @lower decoration inside is applied
+
+
+def __hbgeu2():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Register lowering of __hbgeu2(__nv_bfloat162, __nv_bfloat162) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbgeu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbgeu2(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL8__hbgeu214__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL8__hbgeu214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL8__hbgeu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL8__hbgeu214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hbltu2():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Register lowering of __hbltu2(__nv_bfloat162, __nv_bfloat162) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbltu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbltu2(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL8__hbltu214__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL8__hbltu214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL8__hbltu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL8__hbltu214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hbgtu2():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Register lowering of __hbgtu2(__nv_bfloat162, __nv_bfloat162) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hbgtu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hbgtu2(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL8__hbgtu214__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL8__hbgtu214__nv_bfloat162S__nbst",
+        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL8__hbgtu214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL8__hbgtu214__nv_bfloat162S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __heq():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __heq(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__heq13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __heq(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__heq13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__heq13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__heq13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__heq13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hne():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hne(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__hne13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hne(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__hne13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__hne13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__hne13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__hne13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hle():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hle(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__hle13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hle(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__hle13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__hle13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__hle13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__hle13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hge():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hge(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__hge13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hge(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__hge13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__hge13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__hge13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__hge13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hlt():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hlt(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__hlt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hlt(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__hlt13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__hlt13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__hlt13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__hlt13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hgt():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hgt(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5__hgt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hgt(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL5__hgt13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL5__hgt13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL5__hgt13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL5__hgt13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hequ():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hequ(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hequ13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hequ(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hequ13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hequ13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hequ13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hequ13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hneu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hneu(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hneu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hneu(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hneu13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hneu13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hneu13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hneu13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hleu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hleu(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hleu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hleu(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hleu13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hleu13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hleu13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hleu13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hgeu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hgeu(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hgeu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hgeu(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hgeu13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hgeu13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hgeu13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hgeu13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hltu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hltu(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hltu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hltu(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hltu13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hltu13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hltu13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hltu13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hgtu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hgtu(__nv_bfloat16, __nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hgtu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hgtu(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hgtu13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hgtu13__nv_bfloat16S__nbst",
+        bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hgtu13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hgtu13__nv_bfloat16S__nbst_caller,
+            signature(
+                bool_,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hisnan():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Register lowering of __hisnan(__nv_bfloat16) -> bool via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hisnan13__nv_bfloat16_nbst(bool &retval , __nv_bfloat16* a) {
+        retval = __hisnan(*a);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer arg, writes the result to retval, returns 0.
+
+    _ZL8__hisnan13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL8__hisnan13__nv_bfloat16_nbst", bool_(CPointer(_type___nv_bfloat16))
+    )
+
+    def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0):  # Wrapper function handed to compile_internal below.
+        return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__hisnan, _type___nv_bfloat16)  # Keyed on the by-value argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL8__hisnan13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointer in place of the value.
+            builder,
+            _ZL8__hisnan13__nv_bfloat16_nbst_caller,
+            signature(bool_, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hmax():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hmax(__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat16 via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hmax13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmax(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hmax13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hmax13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hmax13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hmax13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hmin():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hmin(__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat16 via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6__hmin13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmin(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL6__hmin13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL6__hmin13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL6__hmin13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL6__hmin13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hmax_nan():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hmax_nan(__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat16 via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hmax_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmax_nan(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL10__hmax_nan13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL10__hmax_nan13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL10__hmax_nan13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hmin_nan():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # Register lowering of __hmin_nan(__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat16 via an extern "C" shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL10__hmin_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) {
+        retval = __hmin_nan(*a, *b);
+        return 0;
+    }
+        """  # CUDA C++ shim: dereferences the pointer args, writes the result to retval, returns 0.
+
+    _ZL10__hmin_nan13__nv_bfloat16S__nbst = declare_device(  # Declare the shim symbol with its pointer-based signature.
+        "_ZL10__hmin_nan13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # Wrapper function handed to compile_internal below.
+        return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16)  # Keyed on the by-value argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Add shim_obj as a linking file of the active code library.
+        shim_stream.write_with_key(  # Hand the shim source to shim_stream under its symbol name.
+            "_ZL10__hmin_nan13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # Store each value so it can be passed by pointer; align is ty.alignof_ if defined, else None.
+
+        return context.compile_internal(  # Call the wrapper with the pointers in place of the values.
+            builder,
+            _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # Module-level call: performs the @lower registration above.
+
+
+def __hfma_relu():  # Empty placeholder; this function object is the key passed to @lower(...) further down.
+    pass
+
+
+def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hfma_relu for three __nv_bfloat16 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) {
+        retval = __hfma_relu(*a, *b, *c);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hfma_relu, writes the result to retval, returns 0.
+
+    _ZL11__hfma_relu13__nv_bfloat16S_S__nbst = declare_device(  # Declare the shim symbol: three pointer parameters, __nv_bfloat16 return.
+        "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+            CPointer(_type___nv_bfloat16),
+        ),
+    )
+
+    def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(  # Register impl for __hfma_relu with these argument types.
+        __hfma_relu,
+        _type___nv_bfloat16,
+        _type___nv_bfloat16,
+        _type___nv_bfloat16,
+    )
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hfma_relu lowering defined above.
+
+
+def __hmax2():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hmax2 for two __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hmax214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmax2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hmax2, writes the result to retval, returns 0.
+
+    _ZL7__hmax214__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol: two pointer parameters, __nv_bfloat162 return.
+        "_ZL7__hmax214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162)  # Register impl for __hmax2 with these argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL7__hmax214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL7__hmax214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hmax2 lowering defined above.
+
+
+def __hmin2():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hmin2 for two __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7__hmin214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmin2(*a, *b);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hmin2, writes the result to retval, returns 0.
+
+    _ZL7__hmin214__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol: two pointer parameters, __nv_bfloat162 return.
+        "_ZL7__hmin214__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162)  # Register impl for __hmin2 with these argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL7__hmin214__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL7__hmin214__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hmin2 lowering defined above.
+
+
+def __hmax2_nan():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hmax2_nan for two __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hmax2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmax2_nan(*a, *b);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hmax2_nan, writes the result to retval, returns 0.
+
+    _ZL11__hmax2_nan14__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol: two pointer parameters, __nv_bfloat162 return.
+        "_ZL11__hmax2_nan14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162)  # Register impl for __hmax2_nan with these argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hmax2_nan lowering defined above.
+
+
+def __hmin2_nan():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hmin2_nan for two __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL11__hmin2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) {
+        retval = __hmin2_nan(*a, *b);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hmin2_nan, writes the result to retval, returns 0.
+
+    _ZL11__hmin2_nan14__nv_bfloat162S__nbst = declare_device(  # Declare the shim symbol: two pointer parameters, __nv_bfloat162 return.
+        "_ZL11__hmin2_nan14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)
+        ),
+    )
+
+    def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162)  # Register impl for __hmin2_nan with these argument types.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hmin2_nan lowering defined above.
+
+
+def __hfma2_relu():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hfma2_relu for three __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) {
+        retval = __hfma2_relu(*a, *b, *c);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hfma2_relu, writes the result to retval, returns 0.
+
+    _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst = declare_device(  # Declare the shim symbol: three pointer parameters, __nv_bfloat162 return.
+        "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )
+
+    def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(  # Register impl for __hfma2_relu with these argument types.
+        __hfma2_relu,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+    )
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hfma2_relu lowering defined above.
+
+
+def __hcmadd():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj):  # Installs the lowering of __hcmadd for three __nv_bfloat162 arguments, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL8__hcmadd14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) {
+        retval = __hcmadd(*a, *b, *c);
+        return 0;
+    }
+        """  # C++ shim text: dereferences each pointer argument, calls __hcmadd, writes the result to retval, returns 0.
+
+    _ZL8__hcmadd14__nv_bfloat162S_S__nbst = declare_device(  # Declare the shim symbol: three pointer parameters, __nv_bfloat162 return.
+        "_ZL8__hcmadd14__nv_bfloat162S_S__nbst",
+        _type___nv_bfloat162(
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+            CPointer(_type___nv_bfloat162),
+        ),
+    )
+
+    def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2)
+
+    @lower(  # Register impl for __hcmadd with these argument types.
+        __hcmadd,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+        _type___nv_bfloat162,
+    )
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointers, not the values.
+            builder,
+            _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj)  # Executed at module level: installs the __hcmadd lowering defined above.
+
+
+def hsqrt():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hsqrt for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5hsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hsqrt(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hsqrt, writes the result to retval, returns 0.
+
+    _ZL5hsqrt13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL5hsqrt13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hsqrt, _type___nv_bfloat16)  # Register impl for hsqrt with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL5hsqrt13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL5hsqrt13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hsqrt lowering defined above.
+
+
+def hrsqrt():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hrsqrt for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6hrsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hrsqrt(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hrsqrt, writes the result to retval, returns 0.
+
+    _ZL6hrsqrt13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL6hrsqrt13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hrsqrt, _type___nv_bfloat16)  # Register impl for hrsqrt with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL6hrsqrt13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL6hrsqrt13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hrsqrt lowering defined above.
+
+
+def hrcp():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hrcp for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL4hrcp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hrcp(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hrcp, writes the result to retval, returns 0.
+
+    _ZL4hrcp13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL4hrcp13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL4hrcp13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hrcp, _type___nv_bfloat16)  # Register impl for hrcp with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str)  # Write the shim text under its symbol-name key.
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL4hrcp13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hrcp lowering defined above.
+
+
+def hlog():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hlog for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL4hlog13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hlog(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hlog, writes the result to retval, returns 0.
+
+    _ZL4hlog13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL4hlog13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL4hlog13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hlog, _type___nv_bfloat16)  # Register impl for hlog with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str)  # Write the shim text under its symbol-name key.
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL4hlog13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hlog lowering defined above.
+
+
+def hlog2():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hlog2 for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5hlog213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hlog2(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hlog2, writes the result to retval, returns 0.
+
+    _ZL5hlog213__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL5hlog213__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL5hlog213__nv_bfloat16_nbst(arg_0)
+
+    @lower(hlog2, _type___nv_bfloat16)  # Register impl for hlog2 with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL5hlog213__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL5hlog213__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hlog2 lowering defined above.
+
+
+def hlog10():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hlog10 for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6hlog1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hlog10(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hlog10, writes the result to retval, returns 0.
+
+    _ZL6hlog1013__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL6hlog1013__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL6hlog1013__nv_bfloat16_nbst(arg_0)
+
+    @lower(hlog10, _type___nv_bfloat16)  # Register impl for hlog10 with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL6hlog1013__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL6hlog1013__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hlog10 lowering defined above.
+
+
+def hexp():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hexp for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL4hexp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hexp(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hexp, writes the result to retval, returns 0.
+
+    _ZL4hexp13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL4hexp13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL4hexp13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hexp, _type___nv_bfloat16)  # Register impl for hexp with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str)  # Write the shim text under its symbol-name key.
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL4hexp13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hexp lowering defined above.
+
+
+def htanh_approx():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of htanh_approx for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL12htanh_approx13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = htanh_approx(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls htanh_approx, writes the result to retval, returns 0.
+
+    _ZL12htanh_approx13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL12htanh_approx13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0)
+
+    @lower(htanh_approx, _type___nv_bfloat16)  # Register impl for htanh_approx with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL12htanh_approx13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL12htanh_approx13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the htanh_approx lowering defined above.
+
+
+def h2tanh_approx():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Installs the lowering of h2tanh_approx for one __nv_bfloat162 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL13h2tanh_approx14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2tanh_approx(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls h2tanh_approx, writes the result to retval, returns 0.
+
+    _ZL13h2tanh_approx14__nv_bfloat162_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat162 return.
+        "_ZL13h2tanh_approx14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2tanh_approx, _type___nv_bfloat162)  # Register impl for h2tanh_approx with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL13h2tanh_approx14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj)  # Executed at module level: installs the h2tanh_approx lowering defined above.
+
+
+def htanh():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of htanh for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5htanh13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = htanh(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls htanh, writes the result to retval, returns 0.
+
+    _ZL5htanh13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL5htanh13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL5htanh13__nv_bfloat16_nbst(arg_0)
+
+    @lower(htanh, _type___nv_bfloat16)  # Register impl for htanh with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL5htanh13__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL5htanh13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the htanh lowering defined above.
+
+
+def h2tanh():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Installs the lowering of h2tanh for one __nv_bfloat162 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2tanh14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2tanh(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls h2tanh, writes the result to retval, returns 0.
+
+    _ZL6h2tanh14__nv_bfloat162_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat162 return.
+        "_ZL6h2tanh14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2tanh, _type___nv_bfloat162)  # Register impl for h2tanh with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL6h2tanh14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL6h2tanh14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj)  # Executed at module level: installs the h2tanh lowering defined above.
+
+
+def hexp2():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hexp2 for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5hexp213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hexp2(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hexp2, writes the result to retval, returns 0.
+
+    _ZL5hexp213__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL5hexp213__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL5hexp213__nv_bfloat16_nbst(arg_0)
+
+    @lower(hexp2, _type___nv_bfloat16)  # Register impl for hexp2 with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL5hexp213__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL5hexp213__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hexp2 lowering defined above.
+
+
+def hexp10():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hexp10 for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6hexp1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hexp10(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hexp10, writes the result to retval, returns 0.
+
+    _ZL6hexp1013__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL6hexp1013__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL6hexp1013__nv_bfloat16_nbst(arg_0)
+
+    @lower(hexp10, _type___nv_bfloat16)  # Register impl for hexp10 with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL6hexp1013__nv_bfloat16_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL6hexp1013__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hexp10 lowering defined above.
+
+
+def hcos():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hcos for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL4hcos13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hcos(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hcos, writes the result to retval, returns 0.
+
+    _ZL4hcos13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL4hcos13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL4hcos13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hcos, _type___nv_bfloat16)  # Register impl for hcos with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str)  # Write the shim text under its symbol-name key.
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL4hcos13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hcos lowering defined above.
+
+
+def hsin():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj):  # Installs the lowering of hsin for one __nv_bfloat16 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL4hsin13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) {
+        retval = hsin(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls hsin, writes the result to retval, returns 0.
+
+    _ZL4hsin13__nv_bfloat16_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat16 return.
+        "_ZL4hsin13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL4hsin13__nv_bfloat16_nbst(arg_0)
+
+    @lower(hsin, _type___nv_bfloat16)  # Register impl for hsin with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str)  # Write the shim text under its symbol-name key.
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL4hsin13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj)  # Executed at module level: installs the hsin lowering defined above.
+
+
+def h2sqrt():  # Body-less placeholder; the @lower registration below is keyed on this function object.
+    pass
+
+
+def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj):  # Installs the lowering of h2sqrt for one __nv_bfloat162 argument, backed by a C++ shim.
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2sqrt14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2sqrt(*a);
+        return 0;
+    }
+        """  # C++ shim text: dereferences the pointer argument, calls h2sqrt, writes the result to retval, returns 0.
+
+    _ZL6h2sqrt14__nv_bfloat162_nbst = declare_device(  # Declare the shim symbol: one pointer parameter, __nv_bfloat162 return.
+        "_ZL6h2sqrt14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0):  # Python-level forwarder; this is what compile_internal compiles below.
+        return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2sqrt, _type___nv_bfloat162)  # Register impl for h2sqrt with this argument type.
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # Attach shim_obj to the active code library.
+        shim_stream.write_with_key(  # Write the shim text under its symbol-name key.
+            "_ZL6h2sqrt14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # One alloca per argument type.
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when the type defines no alignof_.
+
+        return context.compile_internal(  # Invoke the forwarder on the pointer, not the value.
+            builder,
+            _ZL6h2sqrt14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def h2rsqrt():  # empty stub; the function object is the key passed to @lower in the h2rsqrt lowering that follows
+    pass
+
+
+def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2rsqrt(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7h2rsqrt14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2rsqrt(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2rsqrt(*a) into retval and returns 0
+
+    _ZL7h2rsqrt14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL7h2rsqrt14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2rsqrt, _type___nv_bfloat162)  # impl handles h2rsqrt called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL7h2rsqrt14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL7h2rsqrt14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2rcp():  # empty stub; the function object is the key passed to @lower in the h2rcp lowering that follows
+    pass
+
+
+def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2rcp(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5h2rcp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2rcp(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2rcp(*a) into retval and returns 0
+
+    _ZL5h2rcp14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL5h2rcp14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2rcp, _type___nv_bfloat162)  # impl handles h2rcp called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL5h2rcp14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL5h2rcp14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2log():  # empty stub; the function object is the key passed to @lower in the h2log lowering that follows
+    pass
+
+
+def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2log(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5h2log14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2log(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2log(*a) into retval and returns 0
+
+    _ZL5h2log14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL5h2log14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL5h2log14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2log, _type___nv_bfloat162)  # impl handles h2log called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL5h2log14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL5h2log14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2log2():  # empty stub; the function object is the key passed to @lower in the h2log2 lowering that follows
+    pass
+
+
+def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2log2(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2log214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2log2(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2log2(*a) into retval and returns 0
+
+    _ZL6h2log214__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL6h2log214__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL6h2log214__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2log2, _type___nv_bfloat162)  # impl handles h2log2 called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL6h2log214__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL6h2log214__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2log10():  # empty stub; the function object is the key passed to @lower in the h2log10 lowering that follows
+    pass
+
+
+def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2log10(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7h2log1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2log10(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2log10(*a) into retval and returns 0
+
+    _ZL7h2log1014__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL7h2log1014__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL7h2log1014__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2log10, _type___nv_bfloat162)  # impl handles h2log10 called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL7h2log1014__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL7h2log1014__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2exp():  # empty stub; the function object is the key passed to @lower in the h2exp lowering that follows
+    pass
+
+
+def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2exp(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5h2exp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2exp(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2exp(*a) into retval and returns 0
+
+    _ZL5h2exp14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL5h2exp14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL5h2exp14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2exp, _type___nv_bfloat162)  # impl handles h2exp called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL5h2exp14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL5h2exp14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2exp2():  # empty stub; the function object is the key passed to @lower in the h2exp2 lowering that follows
+    pass
+
+
+def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2exp2(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL6h2exp214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2exp2(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2exp2(*a) into retval and returns 0
+
+    _ZL6h2exp214__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL6h2exp214__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL6h2exp214__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2exp2, _type___nv_bfloat162)  # impl handles h2exp2 called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL6h2exp214__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL6h2exp214__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2exp10():  # empty stub; the function object is the key passed to @lower in the h2exp10 lowering that follows
+    pass
+
+
+def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2exp10(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL7h2exp1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2exp10(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2exp10(*a) into retval and returns 0
+
+    _ZL7h2exp1014__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL7h2exp1014__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2exp10, _type___nv_bfloat162)  # impl handles h2exp10 called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL7h2exp1014__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL7h2exp1014__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2cos():  # empty stub; the function object is the key passed to @lower in the h2cos lowering that follows
+    pass
+
+
+def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2cos(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5h2cos14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2cos(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2cos(*a) into retval and returns 0
+
+    _ZL5h2cos14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL5h2cos14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL5h2cos14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2cos, _type___nv_bfloat162)  # impl handles h2cos called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL5h2cos14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL5h2cos14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def h2sin():  # empty stub; the function object is the key passed to @lower in the h2sin lowering that follows
+    pass
+
+
+def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj):  # registers the lowering for h2sin(__nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL5h2sin14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) {
+        retval = h2sin(*a);
+        return 0;
+    }
+        """  # C++ shim text: writes h2sin(*a) into retval and returns 0
+
+    _ZL5h2sin14__nv_bfloat162_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZL5h2sin14__nv_bfloat162_nbst",
+        _type___nv_bfloat162(CPointer(_type___nv_bfloat162)),
+    )
+
+    def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZL5h2sin14__nv_bfloat162_nbst(arg_0)
+
+    @lower(h2sin, _type___nv_bfloat162)  # impl handles h2sin called with one __nv_bfloat162
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL5h2sin14__nv_bfloat162_nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL5h2sin14__nv_bfloat162_nbst_caller,
+            signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)),
+            ptrs,
+        )
+
+
+_lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def atomicAdd():  # empty stub; used as the @lower key by both atomicAdd lowerings that follow (bfloat162 and bfloat16)
+    pass
+
+
+def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj):  # registers atomicAdd(__nv_bfloat162*, __nv_bfloat162)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9atomicAddP14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) {
+        retval = atomicAdd(*address, *val);
+        return 0;
+    }
+        """  # C++ shim text: writes atomicAdd(*address, *val) into retval and returns 0
+
+    _ZL9atomicAddP14__nv_bfloat162S__nbst = declare_device(  # shim declared by symbol name
+        "_ZL9atomicAddP14__nv_bfloat162S__nbst",
+        _type___nv_bfloat162(
+            CPointer(CPointer(_type___nv_bfloat162)),  # pointer-to-pointer: the address argument is itself passed via a slot
+            CPointer(_type___nv_bfloat162),
+        ),
+    )
+
+    def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1)
+
+    @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162)  # impl handles (pointer, value) arguments
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL9atomicAddP14__nv_bfloat162S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL9atomicAddP14__nv_bfloat162S__nbst_caller,
+            signature(
+                _type___nv_bfloat162,
+                CPointer(CPointer(_type___nv_bfloat162)),
+                CPointer(_type___nv_bfloat162),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj):  # registers atomicAdd(__nv_bfloat16*, __nv_bfloat16)
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZL9atomicAddP13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) {
+        retval = atomicAdd(*address, *val);
+        return 0;
+    }
+        """  # C++ shim text: writes atomicAdd(*address, *val) into retval and returns 0
+
+    _ZL9atomicAddP13__nv_bfloat16S__nbst = declare_device(  # shim declared by symbol name
+        "_ZL9atomicAddP13__nv_bfloat16S__nbst",
+        _type___nv_bfloat16(
+            CPointer(CPointer(_type___nv_bfloat16)),  # pointer-to-pointer: the address argument is itself passed via a slot
+            CPointer(_type___nv_bfloat16),
+        ),
+    )
+
+    def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1)
+
+    @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16)  # impl handles (pointer, value) arguments
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZL9atomicAddP13__nv_bfloat16S__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per argument
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZL9atomicAddP13__nv_bfloat16S__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(CPointer(_type___nv_bfloat16)),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):  # registers operator.add for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZplRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator+(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes operator+(*lh, *rh) into retval and returns 0
+
+    _ZplRK13__nv_bfloat16S1__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZplRK13__nv_bfloat16S1__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 + bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZplRK13__nv_bfloat16S1__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZplRK13__nv_bfloat16S1__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):  # registers operator.sub for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmiRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator-(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes operator-(*lh, *rh) into retval and returns 0
+
+    _ZmiRK13__nv_bfloat16S1__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZmiRK13__nv_bfloat16S1__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 - bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZmiRK13__nv_bfloat16S1__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZmiRK13__nv_bfloat16S1__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):  # registers operator.mul for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmlRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator*(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes operator*(*lh, *rh) into retval and returns 0
+
+    _ZmlRK13__nv_bfloat16S1__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZmlRK13__nv_bfloat16S1__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 * bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZmlRK13__nv_bfloat16S1__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZmlRK13__nv_bfloat16S1__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):  # registers operator.truediv for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZdvRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator/(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes operator/(*lh, *rh) into retval and returns 0
+
+    _ZdvRK13__nv_bfloat16S1__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZdvRK13__nv_bfloat16S1__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 / bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZdvRK13__nv_bfloat16S1__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZdvRK13__nv_bfloat16S1__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):  # registers operator.iadd for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZpLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator+=(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes the result of operator+=(*lh, *rh) into retval and returns 0
+
+    _ZpLR13__nv_bfloat16RKS__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZpLR13__nv_bfloat16RKS__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 += bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZpLR13__nv_bfloat16RKS__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # slots hold copies; lh in the shim points at one
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # the updated value is delivered as the call's return value
+            builder,
+            _ZpLR13__nv_bfloat16RKS__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):  # registers operator.isub for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmIR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator-=(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes the result of operator-=(*lh, *rh) into retval and returns 0
+
+    _ZmIR13__nv_bfloat16RKS__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZmIR13__nv_bfloat16RKS__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 -= bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZmIR13__nv_bfloat16RKS__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # slots hold copies; lh in the shim points at one
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # the updated value is delivered as the call's return value
+            builder,
+            _ZmIR13__nv_bfloat16RKS__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):  # registers operator.imul for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator*=(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes the result of operator*=(*lh, *rh) into retval and returns 0
+
+    _ZmLR13__nv_bfloat16RKS__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZmLR13__nv_bfloat16RKS__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 *= bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZmLR13__nv_bfloat16RKS__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # slots hold copies; lh in the shim points at one
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # the updated value is delivered as the call's return value
+            builder,
+            _ZmLR13__nv_bfloat16RKS__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):  # registers operator.itruediv for two __nv_bfloat16 operands
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZdVR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator/=(*lh, *rh);
+        return 0;
+    }
+        """  # C++ shim text: writes the result of operator/=(*lh, *rh) into retval and returns 0
+
+    _ZdVR13__nv_bfloat16RKS__nbst = declare_device(  # shim declared by symbol name; both operands are pointers
+        "_ZdVR13__nv_bfloat16RKS__nbst",
+        _type___nv_bfloat16(
+            CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)
+        ),
+    )
+
+    def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):  # thin wrapper handed to compile_internal below
+        return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16)  # impl handles bf16 /= bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key(  # shim source, keyed by symbol name
+            "_ZdVR13__nv_bfloat16RKS__nbst", shim_raw_str
+        )
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # slots hold copies; lh in the shim points at one
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # the updated value is delivered as the call's return value
+            builder,
+            _ZdVR13__nv_bfloat16RKS__nbst_caller,
+            signature(
+                _type___nv_bfloat16,
+                CPointer(_type___nv_bfloat16),
+                CPointer(_type___nv_bfloat16),
+            ),
+            ptrs,
+        )
+
+
+_lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # registers operator.pos for one __nv_bfloat16 operand
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZpsRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = operator+(*h);
+        return 0;
+    }
+        """  # C++ shim text: writes unary operator+(*h) into retval and returns 0
+
+    _ZpsRK13__nv_bfloat16_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZpsRK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZpsRK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(operator.pos, _type___nv_bfloat16)  # impl handles unary +bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str)  # shim source, keyed by symbol name
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZpsRK13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj):  # registers operator.neg for one __nv_bfloat16 operand
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZngRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
+        retval = operator-(*h);
+        return 0;
+    }
+        """  # C++ shim text: writes unary operator-(*h) into retval and returns 0
+
+    _ZngRK13__nv_bfloat16_nbst = declare_device(  # shim declared by symbol name; its argument is a pointer
+        "_ZngRK13__nv_bfloat16_nbst",
+        _type___nv_bfloat16(CPointer(_type___nv_bfloat16)),
+    )
+
+    def _ZngRK13__nv_bfloat16_nbst_caller(arg_0):  # thin wrapper handed to compile_internal below
+        return _ZngRK13__nv_bfloat16_nbst(arg_0)
+
+    @lower(operator.neg, _type___nv_bfloat16)  # impl handles unary -bf16
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)  # register shim_obj as a linking file
+        shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str)  # shim source, keyed by symbol name
+        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]  # one stack slot per operand
+        for ptr, ty, arg in zip(ptrs, sig.args, args):
+            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))  # align is None when ty has no alignof_
+
+        return context.compile_internal(  # compile the wrapper, passing the slot pointers as its arguments
+            builder,
+            _ZngRK13__nv_bfloat16_nbst_caller,
+            signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)),
+            ptrs,
+        )
+
+
+_lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj)  # executed at module level, right after the definition
+
+
+def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.eq`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator==`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZeqRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZeqRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator==(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZeqRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZeqRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.ne`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator!=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZneRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZneRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator!=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZneRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZneRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.gt`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator>`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZgtRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZgtRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator>(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZgtRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZgtRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.lt`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator<`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZltRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZltRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator<(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZltRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZltRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.ge`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator>=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZgeRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZgeRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator>=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZgeRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZgeRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.le`` lowering for ``__nv_bfloat16``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator<=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZleRK13__nv_bfloat16S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZleRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
+        retval = operator<=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat16)
+
+    _ZleRK13__nv_bfloat16S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
+        return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1)
+
+    @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZleRK13__nv_bfloat16S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.add`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ binary
+    ``operator+`` to the two pointed-to operands and assigns the result to
+    the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZplRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZplRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator+(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZplRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZplRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.sub`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ binary
+    ``operator-`` to the two pointed-to operands and assigns the result to
+    the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZmiRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmiRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator-(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZmiRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZmiRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.mul`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ binary
+    ``operator*`` to the two pointed-to operands and assigns the result to
+    the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZmlRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmlRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator*(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZmlRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZmlRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.truediv`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ binary
+    ``operator/`` to the two pointed-to operands and assigns the result to
+    the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZdvRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZdvRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator/(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZdvRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZdvRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj):
+    """Register the ``operator.iadd`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator+=`` to the pointed-to operands. The left operand it updates
+    is the copy the lowering stored in an alloca; the value the operator
+    yields is assigned to ``retval`` and becomes the lowering's result.
+    """
+    shim_name = "_ZpLR14__nv_bfloat162RKS__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZpLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator+=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZpLR14__nv_bfloat162RKS__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1):
+        return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZpLR14__nv_bfloat162RKS__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj):
+    """Register the ``operator.isub`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator-=`` to the pointed-to operands. The left operand it updates
+    is the copy the lowering stored in an alloca; the value the operator
+    yields is assigned to ``retval`` and becomes the lowering's result.
+    """
+    shim_name = "_ZmIR14__nv_bfloat162RKS__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmIR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator-=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZmIR14__nv_bfloat162RKS__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1):
+        return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZmIR14__nv_bfloat162RKS__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj):
+    """Register the ``operator.imul`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator*=`` to the pointed-to operands. The left operand it updates
+    is the copy the lowering stored in an alloca; the value the operator
+    yields is assigned to ``retval`` and becomes the lowering's result.
+    """
+    shim_name = "_ZmLR14__nv_bfloat162RKS__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZmLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator*=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZmLR14__nv_bfloat162RKS__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1):
+        return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZmLR14__nv_bfloat162RKS__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj):
+    """Register the ``operator.itruediv`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator/=`` to the pointed-to operands. The left operand it updates
+    is the copy the lowering stored in an alloca; the value the operator
+    yields is assigned to ``retval`` and becomes the lowering's result.
+    """
+    shim_name = "_ZdVR14__nv_bfloat162RKS__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZdVR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator/=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZdVR14__nv_bfloat162RKS__nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr, operand_ptr)
+    )
+
+    def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1):
+        return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1)
+
+    @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZdVR14__nv_bfloat162RKS__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the ``operator.pos`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ unary
+    ``operator+`` to the pointed-to operand and assigns the result to the
+    shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZpsRK14__nv_bfloat162_nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZpsRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = operator+(*h);
+        return 0;
+    }
+        """
+
+    # The shim takes its operand by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZpsRK14__nv_bfloat162_nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr)
+    )
+
+    def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZpsRK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(operator.pos, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr)
+        return context.compile_internal(
+            builder, _ZpsRK14__nv_bfloat162_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj):
+    """Register the ``operator.neg`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++ unary
+    ``operator-`` to the pointed-to operand and assigns the result to the
+    shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZngRK14__nv_bfloat162_nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZngRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) {
+        retval = operator-(*h);
+        return 0;
+    }
+        """
+
+    # The shim takes its operand by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZngRK14__nv_bfloat162_nbst = declare_device(
+        shim_name, _type___nv_bfloat162(operand_ptr)
+    )
+
+    def _ZngRK14__nv_bfloat162_nbst_caller(arg_0):
+        return _ZngRK14__nv_bfloat162_nbst(arg_0)
+
+    @lower(operator.neg, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(_type___nv_bfloat162, operand_ptr)
+        return context.compile_internal(
+            builder, _ZngRK14__nv_bfloat162_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj)
+
+
+def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.eq`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator==`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZeqRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZeqRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator==(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZeqRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZeqRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.ne`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator!=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZneRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZneRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator!=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZneRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZneRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.gt`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator>`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZgtRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZgtRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator>(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZgtRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZgtRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.lt`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator<`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZltRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZltRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator<(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZltRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZltRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.ge`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator>=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZgeRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZgeRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator>=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZgeRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZgeRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj):
+    """Register the ``operator.le`` lowering for ``__nv_bfloat162``.
+
+    The lowering calls an ``extern "C"`` shim that applies the C++
+    ``operator<=`` to the two pointed-to operands and assigns the ``bool``
+    result to the shim's ``retval`` reference parameter.
+    """
+    shim_name = "_ZleRK14__nv_bfloat162S1__nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZleRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
+        retval = operator<=(*lh, *rh);
+        return 0;
+    }
+        """
+
+    # Both operands reach the shim by pointer.
+    operand_ptr = CPointer(_type___nv_bfloat162)
+
+    _ZleRK14__nv_bfloat162S1__nbst = declare_device(
+        shim_name, bool_(operand_ptr, operand_ptr)
+    )
+
+    def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1):
+        return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1)
+
+    @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162)
+    def impl(context, builder, sig, args):
+        # Add the shim object as a linking file and pass the shim source to
+        # the stream, keyed by the shim's symbol name.
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(shim_name, shim_raw_str)
+
+        # Store each argument into its own alloca so the shim can be given
+        # pointers instead of values.
+        slots = [builder.alloca(context.get_value_type(ty)) for ty in sig.args]
+        for slot, ty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(ty, "alignof_", None))
+
+        shim_sig = signature(bool_, operand_ptr, operand_ptr)
+        return context.compile_internal(
+            builder, _ZleRK14__nv_bfloat162S1__nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj)
+
+
+def __half():  # body-less placeholder; passed as the key to @lower further down
+    pass
+
+
+def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj):
+    """Register the lowering of the ``__half`` stub for ``__nv_bfloat16``.
+
+    NOTE(review): the shim evaluates ``__half(*f)`` without assigning it to
+    ``retval`` and the call is typed ``void`` -- confirm that is intended.
+    """
+    symbol = "_ZN6__halfC1E13__nv_bfloat16_nbst"
+    shim_raw_str = """
+    extern "C" __device__ int
+    _ZN6__halfC1E13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* f) {
+        __half(*f);
+        return 0;
+    }
+        """
+    shim_sig = signature(void, CPointer(_type___nv_bfloat16))
+    _ZN6__halfC1E13__nv_bfloat16_nbst = declare_device(symbol, shim_sig)
+
+    def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0):
+        return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0)
+
+    @lower(__half, _type___nv_bfloat16)
+    def impl(context, builder, sig, args):
+        context.active_code_library.add_linking_file(shim_obj)
+        shim_stream.write_with_key(symbol, shim_raw_str)
+        slots = []
+        for argty in sig.args:
+            slots.append(builder.alloca(context.get_value_type(argty)))
+        for slot, argty, value in zip(slots, sig.args, args):
+            builder.store(value, slot, align=getattr(argty, "alignof_", None))
+        return context.compile_internal(
+            builder, _ZN6__halfC1E13__nv_bfloat16_nbst_caller, shim_sig, slots
+        )
+
+
+_lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj)
+
+
+@register
+class _typing___double2bfloat16(ConcreteTemplate):  # typing: (float64) -> __nv_bfloat16
+    key = globals()["__double2bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float64)]
+
+
+register_global(__double2bfloat16, types.Function(_typing___double2bfloat16))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___float2bfloat16(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat16
+    key = globals()["__float2bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float32)]
+
+
+register_global(__float2bfloat16, types.Function(_typing___float2bfloat16))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___float2bfloat16_rn(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat16
+    key = globals()["__float2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float2bfloat16_rn, types.Function(_typing___float2bfloat16_rn)
+)
+
+
+@register
+class _typing___float2bfloat16_rz(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat16
+    key = globals()["__float2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float2bfloat16_rz, types.Function(_typing___float2bfloat16_rz)
+)
+
+
+@register
+class _typing___float2bfloat16_rd(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat16
+    key = globals()["__float2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float2bfloat16_rd, types.Function(_typing___float2bfloat16_rd)
+)
+
+
+@register
+class _typing___float2bfloat16_ru(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat16
+    key = globals()["__float2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float2bfloat16_ru, types.Function(_typing___float2bfloat16_ru)
+)
+
+
+@register
+class _typing___bfloat162float(ConcreteTemplate):  # typing: (__nv_bfloat16) -> float32
+    key = globals()["__bfloat162float"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(float32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162float, types.Function(_typing___bfloat162float))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___float2bfloat162_rn(ConcreteTemplate):  # typing: (float32) -> __nv_bfloat162
+    key = globals()["__float2bfloat162_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float2bfloat162_rn, types.Function(_typing___float2bfloat162_rn)
+)
+
+
+@register
+class _typing___floats2bfloat162_rn(ConcreteTemplate):  # typing: (float32, float32) -> __nv_bfloat162
+    key = globals()["__floats2bfloat162_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, float32, float32)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __floats2bfloat162_rn, types.Function(_typing___floats2bfloat162_rn)
+)
+
+
+@register
+class _typing___low2float(ConcreteTemplate):  # typing: (__nv_bfloat162) -> float32
+    key = globals()["__low2float"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(float32, _type___nv_bfloat162)]
+
+
+register_global(__low2float, types.Function(_typing___low2float))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___high2float(ConcreteTemplate):  # typing: (__nv_bfloat162) -> float32
+    key = globals()["__high2float"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(float32, _type___nv_bfloat162)]
+
+
+register_global(__high2float, types.Function(_typing___high2float))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___float22bfloat162_rn(ConcreteTemplate):  # typing: (float32x2) -> __nv_bfloat162
+    key = globals()["__float22bfloat162_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, float32x2)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __float22bfloat162_rn, types.Function(_typing___float22bfloat162_rn)
+)
+
+
+@register
+class _typing___bfloat1622float2(ConcreteTemplate):  # typing: (__nv_bfloat162) -> float32x2
+    key = globals()["__bfloat1622float2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(float32x2, _type___nv_bfloat162)]
+
+
+register_global(__bfloat1622float2, types.Function(_typing___bfloat1622float2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162char_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int8
+    key = globals()["__bfloat162char_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int8, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162char_rz, types.Function(_typing___bfloat162char_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162uchar_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint8
+    key = globals()["__bfloat162uchar_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint8, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162uchar_rz, types.Function(_typing___bfloat162uchar_rz)
+)
+
+
+@register
+class _typing___bfloat162int_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int32
+    key = globals()["__bfloat162int_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162int_rn, types.Function(_typing___bfloat162int_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162int_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int32
+    key = globals()["__bfloat162int_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162int_rz, types.Function(_typing___bfloat162int_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162int_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int32
+    key = globals()["__bfloat162int_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162int_rd, types.Function(_typing___bfloat162int_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162int_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int32
+    key = globals()["__bfloat162int_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162int_ru, types.Function(_typing___bfloat162int_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___int2bfloat16_rn(ConcreteTemplate):  # typing: (int32) -> __nv_bfloat16
+    key = globals()["__int2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int32)]
+
+
+register_global(__int2bfloat16_rn, types.Function(_typing___int2bfloat16_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___int2bfloat16_rz(ConcreteTemplate):  # typing: (int32) -> __nv_bfloat16
+    key = globals()["__int2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int32)]
+
+
+register_global(__int2bfloat16_rz, types.Function(_typing___int2bfloat16_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___int2bfloat16_rd(ConcreteTemplate):  # typing: (int32) -> __nv_bfloat16
+    key = globals()["__int2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int32)]
+
+
+register_global(__int2bfloat16_rd, types.Function(_typing___int2bfloat16_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___int2bfloat16_ru(ConcreteTemplate):  # typing: (int32) -> __nv_bfloat16
+    key = globals()["__int2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int32)]
+
+
+register_global(__int2bfloat16_ru, types.Function(_typing___int2bfloat16_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162short_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int16
+    key = globals()["__bfloat162short_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162short_rn, types.Function(_typing___bfloat162short_rn)
+)
+
+
+@register
+class _typing___bfloat162short_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int16
+    key = globals()["__bfloat162short_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162short_rz, types.Function(_typing___bfloat162short_rz)
+)
+
+
+@register
+class _typing___bfloat162short_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int16
+    key = globals()["__bfloat162short_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162short_rd, types.Function(_typing___bfloat162short_rd)
+)
+
+
+@register
+class _typing___bfloat162short_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int16
+    key = globals()["__bfloat162short_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162short_ru, types.Function(_typing___bfloat162short_ru)
+)
+
+
+@register
+class _typing___short2bfloat16_rn(ConcreteTemplate):  # typing: (int16) -> __nv_bfloat16
+    key = globals()["__short2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __short2bfloat16_rn, types.Function(_typing___short2bfloat16_rn)
+)
+
+
+@register
+class _typing___short2bfloat16_rz(ConcreteTemplate):  # typing: (int16) -> __nv_bfloat16
+    key = globals()["__short2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __short2bfloat16_rz, types.Function(_typing___short2bfloat16_rz)
+)
+
+
+@register
+class _typing___short2bfloat16_rd(ConcreteTemplate):  # typing: (int16) -> __nv_bfloat16
+    key = globals()["__short2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __short2bfloat16_rd, types.Function(_typing___short2bfloat16_rd)
+)
+
+
+@register
+class _typing___short2bfloat16_ru(ConcreteTemplate):  # typing: (int16) -> __nv_bfloat16
+    key = globals()["__short2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __short2bfloat16_ru, types.Function(_typing___short2bfloat16_ru)
+)
+
+
+@register
+class _typing___bfloat162uint_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint32
+    key = globals()["__bfloat162uint_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162uint_rn, types.Function(_typing___bfloat162uint_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162uint_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint32
+    key = globals()["__bfloat162uint_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162uint_rz, types.Function(_typing___bfloat162uint_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162uint_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint32
+    key = globals()["__bfloat162uint_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162uint_rd, types.Function(_typing___bfloat162uint_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162uint_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint32
+    key = globals()["__bfloat162uint_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint32, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162uint_ru, types.Function(_typing___bfloat162uint_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___uint2bfloat16_rn(ConcreteTemplate):  # typing: (uint32) -> __nv_bfloat16
+    key = globals()["__uint2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint32)]
+
+
+register_global(__uint2bfloat16_rn, types.Function(_typing___uint2bfloat16_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___uint2bfloat16_rz(ConcreteTemplate):  # typing: (uint32) -> __nv_bfloat16
+    key = globals()["__uint2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint32)]
+
+
+register_global(__uint2bfloat16_rz, types.Function(_typing___uint2bfloat16_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___uint2bfloat16_rd(ConcreteTemplate):  # typing: (uint32) -> __nv_bfloat16
+    key = globals()["__uint2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint32)]
+
+
+register_global(__uint2bfloat16_rd, types.Function(_typing___uint2bfloat16_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___uint2bfloat16_ru(ConcreteTemplate):  # typing: (uint32) -> __nv_bfloat16
+    key = globals()["__uint2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint32)]
+
+
+register_global(__uint2bfloat16_ru, types.Function(_typing___uint2bfloat16_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ushort_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint16
+    key = globals()["__bfloat162ushort_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162ushort_rn, types.Function(_typing___bfloat162ushort_rn)
+)
+
+
+@register
+class _typing___bfloat162ushort_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint16
+    key = globals()["__bfloat162ushort_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162ushort_rz, types.Function(_typing___bfloat162ushort_rz)
+)
+
+
+@register
+class _typing___bfloat162ushort_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint16
+    key = globals()["__bfloat162ushort_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162ushort_rd, types.Function(_typing___bfloat162ushort_rd)
+)
+
+
+@register
+class _typing___bfloat162ushort_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint16
+    key = globals()["__bfloat162ushort_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162ushort_ru, types.Function(_typing___bfloat162ushort_ru)
+)
+
+
+@register
+class _typing___ushort2bfloat16_rn(ConcreteTemplate):  # typing: (uint16) -> __nv_bfloat16
+    key = globals()["__ushort2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __ushort2bfloat16_rn, types.Function(_typing___ushort2bfloat16_rn)
+)
+
+
+@register
+class _typing___ushort2bfloat16_rz(ConcreteTemplate):  # typing: (uint16) -> __nv_bfloat16
+    key = globals()["__ushort2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __ushort2bfloat16_rz, types.Function(_typing___ushort2bfloat16_rz)
+)
+
+
+@register
+class _typing___ushort2bfloat16_rd(ConcreteTemplate):  # typing: (uint16) -> __nv_bfloat16
+    key = globals()["__ushort2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __ushort2bfloat16_rd, types.Function(_typing___ushort2bfloat16_rd)
+)
+
+
+@register
+class _typing___ushort2bfloat16_ru(ConcreteTemplate):  # typing: (uint16) -> __nv_bfloat16
+    key = globals()["__ushort2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __ushort2bfloat16_ru, types.Function(_typing___ushort2bfloat16_ru)
+)
+
+
+@register
+class _typing___bfloat162ull_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint64
+    key = globals()["__bfloat162ull_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ull_rn, types.Function(_typing___bfloat162ull_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ull_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint64
+    key = globals()["__bfloat162ull_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ull_rz, types.Function(_typing___bfloat162ull_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_make_bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat162
+    key = globals()["make_bfloat162"]  # string key resolves the module-level object by name
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16
+        )
+    ]
+
+
+register_global(make_bfloat162, types.Function(_typing_make_bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ull_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint64
+    key = globals()["__bfloat162ull_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ull_rd, types.Function(_typing___bfloat162ull_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ull_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint64
+    key = globals()["__bfloat162ull_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ull_ru, types.Function(_typing___bfloat162ull_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ull2bfloat16_rn(ConcreteTemplate):  # typing: (uint64) -> __nv_bfloat16
+    key = globals()["__ull2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint64)]
+
+
+register_global(__ull2bfloat16_rn, types.Function(_typing___ull2bfloat16_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ull2bfloat16_rz(ConcreteTemplate):  # typing: (uint64) -> __nv_bfloat16
+    key = globals()["__ull2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint64)]
+
+
+register_global(__ull2bfloat16_rz, types.Function(_typing___ull2bfloat16_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ull2bfloat16_rd(ConcreteTemplate):  # typing: (uint64) -> __nv_bfloat16
+    key = globals()["__ull2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint64)]
+
+
+register_global(__ull2bfloat16_rd, types.Function(_typing___ull2bfloat16_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ull2bfloat16_ru(ConcreteTemplate):  # typing: (uint64) -> __nv_bfloat16
+    key = globals()["__ull2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint64)]
+
+
+register_global(__ull2bfloat16_ru, types.Function(_typing___ull2bfloat16_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ll_rn(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int64
+    key = globals()["__bfloat162ll_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ll_rn, types.Function(_typing___bfloat162ll_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ll_rz(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int64
+    key = globals()["__bfloat162ll_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ll_rz, types.Function(_typing___bfloat162ll_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ll_rd(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int64
+    key = globals()["__bfloat162ll_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ll_rd, types.Function(_typing___bfloat162ll_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162ll_ru(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int64
+    key = globals()["__bfloat162ll_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int64, _type___nv_bfloat16)]
+
+
+register_global(__bfloat162ll_ru, types.Function(_typing___bfloat162ll_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ll2bfloat16_rn(ConcreteTemplate):  # typing: (int64) -> __nv_bfloat16
+    key = globals()["__ll2bfloat16_rn"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int64)]
+
+
+register_global(__ll2bfloat16_rn, types.Function(_typing___ll2bfloat16_rn))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ll2bfloat16_rz(ConcreteTemplate):  # typing: (int64) -> __nv_bfloat16
+    key = globals()["__ll2bfloat16_rz"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int64)]
+
+
+register_global(__ll2bfloat16_rz, types.Function(_typing___ll2bfloat16_rz))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ll2bfloat16_rd(ConcreteTemplate):  # typing: (int64) -> __nv_bfloat16
+    key = globals()["__ll2bfloat16_rd"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int64)]
+
+
+register_global(__ll2bfloat16_rd, types.Function(_typing___ll2bfloat16_rd))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___ll2bfloat16_ru(ConcreteTemplate):  # typing: (int64) -> __nv_bfloat16
+    key = globals()["__ll2bfloat16_ru"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int64)]
+
+
+register_global(__ll2bfloat16_ru, types.Function(_typing___ll2bfloat16_ru))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_htrunc(ConcreteTemplate):  # typing: (__nv_bfloat16) -> __nv_bfloat16
+    key = globals()["htrunc"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(htrunc, types.Function(_typing_htrunc))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_hceil(ConcreteTemplate):  # typing: (__nv_bfloat16) -> __nv_bfloat16
+    key = globals()["hceil"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(hceil, types.Function(_typing_hceil))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_hfloor(ConcreteTemplate):  # typing: (__nv_bfloat16) -> __nv_bfloat16
+    key = globals()["hfloor"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(hfloor, types.Function(_typing_hfloor))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_hrint(ConcreteTemplate):  # typing: (__nv_bfloat16) -> __nv_bfloat16
+    key = globals()["hrint"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(hrint, types.Function(_typing_hrint))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_h2trunc(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["h2trunc"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(h2trunc, types.Function(_typing_h2trunc))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_h2ceil(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["h2ceil"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(h2ceil, types.Function(_typing_h2ceil))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_h2floor(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["h2floor"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(h2floor, types.Function(_typing_h2floor))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing_h2rint(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["h2rint"]  # string key resolves the module-level object by name
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(h2rint, types.Function(_typing_h2rint))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat162bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat16) -> __nv_bfloat162
+    key = globals()["__bfloat162bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat162bfloat162, types.Function(_typing___bfloat162bfloat162)
+)
+
+
+@register
+class _typing___lowhigh2highlow(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__lowhigh2highlow"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__lowhigh2highlow, types.Function(_typing___lowhigh2highlow))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___lows2bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__lows2bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__lows2bfloat162, types.Function(_typing___lows2bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___highs2bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__highs2bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__highs2bfloat162, types.Function(_typing___highs2bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___high2bfloat16(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat16
+    key = globals()["__high2bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)]
+
+
+register_global(__high2bfloat16, types.Function(_typing___high2bfloat16))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___low2bfloat16(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat16
+    key = globals()["__low2bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)]
+
+
+register_global(__low2bfloat16, types.Function(_typing___low2bfloat16))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hisinf(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int32
+    key = globals()["__hisinf"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int32, _type___nv_bfloat16)]
+
+
+register_global(__hisinf, types.Function(_typing___hisinf))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___halves2bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat16, __nv_bfloat16) -> __nv_bfloat162
+    key = globals()["__halves2bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16
+        )
+    ]
+
+
+register_global(__halves2bfloat162, types.Function(_typing___halves2bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___low2bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__low2bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__low2bfloat162, types.Function(_typing___low2bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___high2bfloat162(ConcreteTemplate):  # typing: (__nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__high2bfloat162"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__high2bfloat162, types.Function(_typing___high2bfloat162))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___bfloat16_as_short(ConcreteTemplate):  # typing: (__nv_bfloat16) -> int16
+    key = globals()["__bfloat16_as_short"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(int16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat16_as_short, types.Function(_typing___bfloat16_as_short)
+)
+
+
+@register
+class _typing___bfloat16_as_ushort(ConcreteTemplate):  # typing: (__nv_bfloat16) -> uint16
+    key = globals()["__bfloat16_as_ushort"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(uint16, _type___nv_bfloat16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __bfloat16_as_ushort, types.Function(_typing___bfloat16_as_ushort)
+)
+
+
+@register
+class _typing___short_as_bfloat16(ConcreteTemplate):  # typing: (int16) -> __nv_bfloat16
+    key = globals()["__short_as_bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, int16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __short_as_bfloat16, types.Function(_typing___short_as_bfloat16)
+)
+
+
+@register
+class _typing___ushort_as_bfloat16(ConcreteTemplate):  # typing: (uint16) -> __nv_bfloat16
+    key = globals()["__ushort_as_bfloat16"]  # string key: a bare "__" name is mangled in a class body
+    cases = [signature(_type___nv_bfloat16, uint16)]
+
+
+register_global(  # pair the object with a Function type wrapping this template
+    __ushort_as_bfloat16, types.Function(_typing___ushort_as_bfloat16)
+)
+
+
+@register
+class _typing___shfl_sync(ConcreteTemplate):
+    """Typing for ``__shfl_sync``: ``(uint32, T, int32, int32) -> T``."""
+
+    key = globals()["__shfl_sync"]
+    # One case per T: __nv_bfloat162 first, then __nv_bfloat16.
+    cases = [
+        signature(ty, uint32, ty, int32, int32)
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+# Pair the module-level object with a Function type wrapping the template.
+register_global(__shfl_sync, types.Function(_typing___shfl_sync))
+
+
+@register
+class _typing___shfl_up_sync(ConcreteTemplate):
+    """Typing for ``__shfl_up_sync``: ``(uint32, T, uint32, int32) -> T``."""
+
+    key = globals()["__shfl_up_sync"]
+    # One case per T: __nv_bfloat162 first, then __nv_bfloat16.
+    cases = [
+        signature(ty, uint32, ty, uint32, int32)
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+# Pair the module-level object with a Function type wrapping the template.
+register_global(__shfl_up_sync, types.Function(_typing___shfl_up_sync))
+
+
+@register
+class _typing___shfl_down_sync(ConcreteTemplate):
+    """Typing for ``__shfl_down_sync``: ``(uint32, T, uint32, int32) -> T``."""
+
+    key = globals()["__shfl_down_sync"]
+    # One case per T: __nv_bfloat162 first, then __nv_bfloat16.
+    cases = [
+        signature(ty, uint32, ty, uint32, int32)
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+# Pair the module-level object with a Function type wrapping the template.
+register_global(__shfl_down_sync, types.Function(_typing___shfl_down_sync))
+
+
+@register
+class _typing___shfl_xor_sync(ConcreteTemplate):
+    """Typing for ``__shfl_xor_sync``: ``(uint32, T, int32, int32) -> T``."""
+
+    key = globals()["__shfl_xor_sync"]
+    # One case per T: __nv_bfloat162 first, then __nv_bfloat16.
+    cases = [
+        signature(ty, uint32, ty, int32, int32)
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+# Pair the module-level object with a Function type wrapping the template.
+register_global(__shfl_xor_sync, types.Function(_typing___shfl_xor_sync))
+
+
+@register
+class _typing___ldg(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldg"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldg, types.Function(_typing___ldg))
+
+
+@register
+class _typing___ldcg(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldcg"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldcg, types.Function(_typing___ldcg))
+
+
+@register
+class _typing___ldca(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldca"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldca, types.Function(_typing___ldca))
+
+
+@register
+class _typing___ldcs(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldcs"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldcs, types.Function(_typing___ldcs))
+
+
+@register
+class _typing___ldlu(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldlu"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldlu, types.Function(_typing___ldlu))
+
+
+@register
+class _typing___ldcv(ConcreteTemplate):  # typing: (CPointer(T)) -> T
+    key = globals()["__ldcv"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(ty, CPointer(ty))  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__ldcv, types.Function(_typing___ldcv))
+
+
+@register
+class _typing___stwb(ConcreteTemplate):  # typing: (CPointer(T), T) -> void
+    key = globals()["__stwb"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(void, CPointer(ty), ty)  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__stwb, types.Function(_typing___stwb))
+
+
+@register
+class _typing___stcg(ConcreteTemplate):  # typing: (CPointer(T), T) -> void
+    key = globals()["__stcg"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(void, CPointer(ty), ty)  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__stcg, types.Function(_typing___stcg))
+
+
+@register
+class _typing___stcs(ConcreteTemplate):  # typing: (CPointer(T), T) -> void
+    key = globals()["__stcs"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(void, CPointer(ty), ty)  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__stcs, types.Function(_typing___stcs))
+
+
+@register
+class _typing___stwt(ConcreteTemplate):  # typing: (CPointer(T), T) -> void
+    key = globals()["__stwt"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(void, CPointer(ty), ty)  # T = __nv_bfloat162, then __nv_bfloat16
+        for ty in (_type___nv_bfloat162, _type___nv_bfloat16)
+    ]
+
+
+register_global(__stwt, types.Function(_typing___stwt))
+
+
+@register
+class _typing___heq2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__heq2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__heq2, types.Function(_typing___heq2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hne2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hne2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hne2, types.Function(_typing___hne2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hle2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hle2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hle2, types.Function(_typing___hle2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hge2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hge2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hge2, types.Function(_typing___hge2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hlt2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hlt2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hlt2, types.Function(_typing___hlt2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hgt2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hgt2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hgt2, types.Function(_typing___hgt2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hequ2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hequ2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hequ2, types.Function(_typing___hequ2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hneu2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hneu2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hneu2, types.Function(_typing___hneu2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hleu2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hleu2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hleu2, types.Function(_typing___hleu2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hgeu2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hgeu2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hgeu2, types.Function(_typing___hgeu2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hltu2(ConcreteTemplate):  # typing: (__nv_bfloat162, __nv_bfloat162) -> __nv_bfloat162
+    key = globals()["__hltu2"]  # string key: a bare "__" name is mangled in a class body
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hltu2, types.Function(_typing___hltu2))  # pair the object with a Function type wrapping this template
+
+
+@register
+class _typing___hgtu2(ConcreteTemplate):
+    key = globals()["__hgtu2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hgtu2, types.Function(_typing___hgtu2))
+
+
+@register
+class _typing___heq2_mask(ConcreteTemplate):
+    key = globals()["__heq2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__heq2_mask, types.Function(_typing___heq2_mask))
+
+
+@register
+class _typing___hne2_mask(ConcreteTemplate):
+    key = globals()["__hne2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hne2_mask, types.Function(_typing___hne2_mask))
+
+
+@register
+class _typing___hle2_mask(ConcreteTemplate):
+    key = globals()["__hle2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hle2_mask, types.Function(_typing___hle2_mask))
+
+
+@register
+class _typing___hge2_mask(ConcreteTemplate):
+    key = globals()["__hge2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hge2_mask, types.Function(_typing___hge2_mask))
+
+
+@register
+class _typing___hlt2_mask(ConcreteTemplate):
+    key = globals()["__hlt2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hlt2_mask, types.Function(_typing___hlt2_mask))
+
+
+@register
+class _typing___hgt2_mask(ConcreteTemplate):
+    key = globals()["__hgt2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hgt2_mask, types.Function(_typing___hgt2_mask))
+
+
+@register
+class _typing___hequ2_mask(ConcreteTemplate):
+    key = globals()["__hequ2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hequ2_mask, types.Function(_typing___hequ2_mask))
+
+
+@register
+class _typing___hneu2_mask(ConcreteTemplate):
+    key = globals()["__hneu2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hneu2_mask, types.Function(_typing___hneu2_mask))
+
+
+@register
+class _typing___hleu2_mask(ConcreteTemplate):
+    key = globals()["__hleu2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hleu2_mask, types.Function(_typing___hleu2_mask))
+
+
+@register
+class _typing___hgeu2_mask(ConcreteTemplate):
+    key = globals()["__hgeu2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hgeu2_mask, types.Function(_typing___hgeu2_mask))
+
+
+@register
+class _typing___hltu2_mask(ConcreteTemplate):
+    key = globals()["__hltu2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hltu2_mask, types.Function(_typing___hltu2_mask))
+
+
+@register
+class _typing___hgtu2_mask(ConcreteTemplate):
+    key = globals()["__hgtu2_mask"]
+    cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hgtu2_mask, types.Function(_typing___hgtu2_mask))
+
+
+@register
+class _typing___hisnan2(ConcreteTemplate):
+    key = globals()["__hisnan2"]
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hisnan2, types.Function(_typing___hisnan2))
+
+
+@register
+class _typing___hadd2(ConcreteTemplate):
+    key = globals()["__hadd2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hadd2, types.Function(_typing___hadd2))
+
+
+@register
+class _typing___hsub2(ConcreteTemplate):
+    key = globals()["__hsub2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hsub2, types.Function(_typing___hsub2))
+
+
+@register
+class _typing___hmul2(ConcreteTemplate):
+    key = globals()["__hmul2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hmul2, types.Function(_typing___hmul2))
+
+
+@register
+class _typing___hadd2_rn(ConcreteTemplate):
+    key = globals()["__hadd2_rn"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hadd2_rn, types.Function(_typing___hadd2_rn))
+
+
+@register
+class _typing___hsub2_rn(ConcreteTemplate):
+    key = globals()["__hsub2_rn"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hsub2_rn, types.Function(_typing___hsub2_rn))
+
+
+@register
+class _typing___hmul2_rn(ConcreteTemplate):
+    key = globals()["__hmul2_rn"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hmul2_rn, types.Function(_typing___hmul2_rn))
+
+
+@register
+class _typing___h2div(ConcreteTemplate):
+    key = globals()["__h2div"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__h2div, types.Function(_typing___h2div))
+
+
+@register
+class _typing___habs2(ConcreteTemplate):
+    key = globals()["__habs2"]
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__habs2, types.Function(_typing___habs2))
+
+
+@register
+class _typing___hadd2_sat(ConcreteTemplate):
+    key = globals()["__hadd2_sat"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hadd2_sat, types.Function(_typing___hadd2_sat))
+
+
+@register
+class _typing___hsub2_sat(ConcreteTemplate):
+    key = globals()["__hsub2_sat"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hsub2_sat, types.Function(_typing___hsub2_sat))
+
+
+@register
+class _typing___hmul2_sat(ConcreteTemplate):
+    key = globals()["__hmul2_sat"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
+
+
+register_global(__hmul2_sat, types.Function(_typing___hmul2_sat))
+
+
+@register
+class _typing___hfma2(ConcreteTemplate):
+    key = globals()["__hfma2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+        )
+    ]
+
+
+register_global(__hfma2, types.Function(_typing___hfma2))
+
+
+@register
+class _typing___hfma2_sat(ConcreteTemplate):
+    key = globals()["__hfma2_sat"]
+    cases = [
+        signature(
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+        )
+    ]
+
+
+register_global(__hfma2_sat, types.Function(_typing___hfma2_sat))
+
+
+@register
+class _typing___hneg2(ConcreteTemplate):
+    key = globals()["__hneg2"]
+    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hneg2, types.Function(_typing___hneg2))
+
+
+@register
+class _typing___habs(ConcreteTemplate):
+    key = globals()["__habs"]
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(__habs, types.Function(_typing___habs))
+
+
+@register
+class _typing___hadd(ConcreteTemplate):
+    key = globals()["__hadd"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hadd, types.Function(_typing___hadd))
+
+
+@register
+class _typing___hsub(ConcreteTemplate):
+    key = globals()["__hsub"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hsub, types.Function(_typing___hsub))
+
+
+@register
+class _typing___hmul(ConcreteTemplate):
+    key = globals()["__hmul"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hmul, types.Function(_typing___hmul))
+
+
+@register
+class _typing___hadd_rn(ConcreteTemplate):
+    key = globals()["__hadd_rn"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hadd_rn, types.Function(_typing___hadd_rn))
+
+
+@register
+class _typing___hsub_rn(ConcreteTemplate):
+    key = globals()["__hsub_rn"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hsub_rn, types.Function(_typing___hsub_rn))
+
+
+@register
+class _typing___hmul_rn(ConcreteTemplate):
+    key = globals()["__hmul_rn"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hmul_rn, types.Function(_typing___hmul_rn))
+
+
+@register
+class _typing___hdiv(ConcreteTemplate):
+    key = globals()["__hdiv"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hdiv, types.Function(_typing___hdiv))
+
+
+@register
+class _typing___hadd_sat(ConcreteTemplate):
+    key = globals()["__hadd_sat"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hadd_sat, types.Function(_typing___hadd_sat))
+
+
+@register
+class _typing___hsub_sat(ConcreteTemplate):
+    key = globals()["__hsub_sat"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hsub_sat, types.Function(_typing___hsub_sat))
+
+
+@register
+class _typing___hmul_sat(ConcreteTemplate):
+    key = globals()["__hmul_sat"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
+
+
+register_global(__hmul_sat, types.Function(_typing___hmul_sat))
+
+
+@register
+class _typing___hfma(ConcreteTemplate):
+    key = globals()["__hfma"]
+    cases = [
+        signature(
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+        )
+    ]
+
+
+register_global(__hfma, types.Function(_typing___hfma))
+
+
+@register
+class _typing___hfma_sat(ConcreteTemplate):
+    key = globals()["__hfma_sat"]
+    cases = [
+        signature(
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+        )
+    ]
+
+
+register_global(__hfma_sat, types.Function(_typing___hfma_sat))
+
+
+@register
+class _typing___hneg(ConcreteTemplate):
+    key = globals()["__hneg"]
+    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(__hneg, types.Function(_typing___hneg))
+
+
+@register
+class _typing___hbeq2(ConcreteTemplate):
+    key = globals()["__hbeq2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbeq2, types.Function(_typing___hbeq2))
+
+
+@register
+class _typing___hbne2(ConcreteTemplate):
+    key = globals()["__hbne2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbne2, types.Function(_typing___hbne2))
+
+
+@register
+class _typing___hble2(ConcreteTemplate):
+    key = globals()["__hble2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hble2, types.Function(_typing___hble2))
+
+
+@register
+class _typing___hbge2(ConcreteTemplate):
+    key = globals()["__hbge2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbge2, types.Function(_typing___hbge2))
+
+
+@register
+class _typing___hblt2(ConcreteTemplate):
+    key = globals()["__hblt2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hblt2, types.Function(_typing___hblt2))
+
+
+@register
+class _typing___hbgt2(ConcreteTemplate):
+    key = globals()["__hbgt2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbgt2, types.Function(_typing___hbgt2))
+
+
+@register
+class _typing___hbequ2(ConcreteTemplate):
+    key = globals()["__hbequ2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbequ2, types.Function(_typing___hbequ2))
+
+
+@register
+class _typing___hbneu2(ConcreteTemplate):
+    key = globals()["__hbneu2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbneu2, types.Function(_typing___hbneu2))
+
+
+@register
+class _typing___hbleu2(ConcreteTemplate):
+    key = globals()["__hbleu2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbleu2, types.Function(_typing___hbleu2))
+
+
+@register
+class _typing___hbgeu2(ConcreteTemplate):
+    key = globals()["__hbgeu2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbgeu2, types.Function(_typing___hbgeu2))
+
+
+@register
+class _typing___hbltu2(ConcreteTemplate):
+    key = globals()["__hbltu2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbltu2, types.Function(_typing___hbltu2))
+
+
+@register
+class _typing___hbgtu2(ConcreteTemplate):
+    key = globals()["__hbgtu2"]
+    cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)]
+
+
+register_global(__hbgtu2, types.Function(_typing___hbgtu2))
+
+
+@register
+class _typing___heq(ConcreteTemplate):
+    key = globals()["__heq"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(__heq, types.Function(_typing___heq))
+
+
+@register
+class _typing___hne(ConcreteTemplate):
+    key = globals()["__hne"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(__hne, types.Function(_typing___hne))
+
+
+@register
+class _typing___hle(ConcreteTemplate):
+    key = globals()["__hle"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
+
+
+register_global(__hle, types.Function(_typing___hle))
+
+
+@register
+class _typing___hge(ConcreteTemplate):
+    key = globals()["__hge"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-    def operator_eq_2_caller(arg_0, arg_1):
-        return operator_eq_2(arg_0, arg_1)
 
-    @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_eq_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hge, types.Function(_typing___hge))
 
-        return context.compile_internal(
-            builder,
-            operator_eq_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
 
+@register
+class _typing___hlt(ConcreteTemplate):
+    key = globals()["__hlt"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-_operator_eq_2_lower(shim_stream, shim_obj)
 
+register_global(__hlt, types.Function(_typing___hlt))
 
-def _operator_ne_2_lower(shim_stream, shim_obj):
-    shim_raw_str = """
-    extern "C" __device__ int
-    operator_ne_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator!=(*lh, *rh);
-        return 0;
-    }
-        """
 
-    operator_ne_2 = declare_device(
-        "operator_ne_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+@register
+class _typing___hgt(ConcreteTemplate):
+    key = globals()["__hgt"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-    def operator_ne_2_caller(arg_0, arg_1):
-        return operator_ne_2(arg_0, arg_1)
 
-    @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_ne_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hgt, types.Function(_typing___hgt))
 
-        return context.compile_internal(
-            builder,
-            operator_ne_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
 
+@register
+class _typing___hequ(ConcreteTemplate):
+    key = globals()["__hequ"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-_operator_ne_2_lower(shim_stream, shim_obj)
 
+register_global(__hequ, types.Function(_typing___hequ))
 
-def _operator_gt_2_lower(shim_stream, shim_obj):
-    shim_raw_str = """
-    extern "C" __device__ int
-    operator_gt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator>(*lh, *rh);
-        return 0;
-    }
-        """
 
-    operator_gt_2 = declare_device(
-        "operator_gt_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+@register
+class _typing___hneu(ConcreteTemplate):
+    key = globals()["__hneu"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-    def operator_gt_2_caller(arg_0, arg_1):
-        return operator_gt_2(arg_0, arg_1)
 
-    @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_gt_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hneu, types.Function(_typing___hneu))
 
-        return context.compile_internal(
-            builder,
-            operator_gt_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
 
+@register
+class _typing___hleu(ConcreteTemplate):
+    key = globals()["__hleu"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-_operator_gt_2_lower(shim_stream, shim_obj)
 
+register_global(__hleu, types.Function(_typing___hleu))
 
-def _operator_lt_2_lower(shim_stream, shim_obj):
-    shim_raw_str = """
-    extern "C" __device__ int
-    operator_lt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator<(*lh, *rh);
-        return 0;
-    }
-        """
 
-    operator_lt_2 = declare_device(
-        "operator_lt_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+@register
+class _typing___hgeu(ConcreteTemplate):
+    key = globals()["__hgeu"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-    def operator_lt_2_caller(arg_0, arg_1):
-        return operator_lt_2(arg_0, arg_1)
 
-    @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_lt_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hgeu, types.Function(_typing___hgeu))
 
-        return context.compile_internal(
-            builder,
-            operator_lt_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
 
+@register
+class _typing___hltu(ConcreteTemplate):
+    key = globals()["__hltu"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-_operator_lt_2_lower(shim_stream, shim_obj)
 
+register_global(__hltu, types.Function(_typing___hltu))
 
-def _operator_ge_2_lower(shim_stream, shim_obj):
-    shim_raw_str = """
-    extern "C" __device__ int
-    operator_ge_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator>=(*lh, *rh);
-        return 0;
-    }
-        """
 
-    operator_ge_2 = declare_device(
-        "operator_ge_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+@register
+class _typing___hgtu(ConcreteTemplate):
+    key = globals()["__hgtu"]
+    cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)]
 
-    def operator_ge_2_caller(arg_0, arg_1):
-        return operator_ge_2(arg_0, arg_1)
 
-    @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_ge_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hgtu, types.Function(_typing___hgtu))
 
-        return context.compile_internal(
-            builder,
-            operator_ge_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
 
+@register
+class _typing___hisnan(ConcreteTemplate):
+    key = globals()["__hisnan"]
+    cases = [signature(bool_, _type___nv_bfloat16)]
 
-_operator_ge_2_lower(shim_stream, shim_obj)
 
+register_global(__hisnan, types.Function(_typing___hisnan))
 
-def _operator_le_2_lower(shim_stream, shim_obj):
-    shim_raw_str = """
-    extern "C" __device__ int
-    operator_le_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) {
-        retval = operator<=(*lh, *rh);
-        return 0;
-    }
-        """
 
-    operator_le_2 = declare_device(
-        "operator_le_2",
-        bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)),
-    )
+@register
+class _typing___hmax(ConcreteTemplate):
+    key = globals()["__hmax"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
 
-    def operator_le_2_caller(arg_0, arg_1):
-        return operator_le_2(arg_0, arg_1)
 
-    @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162)
-    def impl(context, builder, sig, args):
-        context.active_code_library.add_linking_file(shim_obj)
-        shim_stream.write_with_key("operator_le_2", shim_raw_str)
-        ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args]
-        for ptr, ty, arg in zip(ptrs, sig.args, args):
-            builder.store(arg, ptr, align=getattr(ty, "alignof_", None))
+register_global(__hmax, types.Function(_typing___hmax))
 
-        return context.compile_internal(
-            builder,
-            operator_le_2_caller,
-            signature(
-                bool_,
-                CPointer(_type___nv_bfloat162),
-                CPointer(_type___nv_bfloat162),
-            ),
-            ptrs,
-        )
+
+@register
+class _typing___hmin(ConcreteTemplate):
+    key = globals()["__hmin"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
 
 
-_operator_le_2_lower(shim_stream, shim_obj)
+register_global(__hmin, types.Function(_typing___hmin))
 
 
 @register
-class _typing_make_bfloat162(ConcreteTemplate):
-    key = globals()["make_bfloat162"]
+class _typing___hmax_nan(ConcreteTemplate):
+    key = globals()["__hmax_nan"]
     cases = [
-        signature(
-            _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16
-        )
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
     ]
 
 
-register_global(make_bfloat162, types.Function(_typing_make_bfloat162))
+register_global(__hmax_nan, types.Function(_typing___hmax_nan))
 
 
 @register
-class _typing_htrunc(ConcreteTemplate):
-    key = globals()["htrunc"]
-    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+class _typing___hmin_nan(ConcreteTemplate):
+    key = globals()["__hmin_nan"]
+    cases = [
+        signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16)
+    ]
 
 
-register_global(htrunc, types.Function(_typing_htrunc))
+register_global(__hmin_nan, types.Function(_typing___hmin_nan))
 
 
 @register
-class _typing_hceil(ConcreteTemplate):
-    key = globals()["hceil"]
-    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+class _typing___hfma_relu(ConcreteTemplate):
+    key = globals()["__hfma_relu"]
+    cases = [
+        signature(
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+            _type___nv_bfloat16,
+        )
+    ]
 
 
-register_global(hceil, types.Function(_typing_hceil))
+register_global(__hfma_relu, types.Function(_typing___hfma_relu))
 
 
 @register
-class _typing_hfloor(ConcreteTemplate):
-    key = globals()["hfloor"]
-    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+class _typing___hmax2(ConcreteTemplate):
+    key = globals()["__hmax2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
 
 
-register_global(hfloor, types.Function(_typing_hfloor))
+register_global(__hmax2, types.Function(_typing___hmax2))
 
 
 @register
-class _typing_hrint(ConcreteTemplate):
-    key = globals()["hrint"]
-    cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)]
+class _typing___hmin2(ConcreteTemplate):
+    key = globals()["__hmin2"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
 
 
-register_global(hrint, types.Function(_typing_hrint))
+register_global(__hmin2, types.Function(_typing___hmin2))
 
 
 @register
-class _typing_h2trunc(ConcreteTemplate):
-    key = globals()["h2trunc"]
-    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+class _typing___hmax2_nan(ConcreteTemplate):
+    key = globals()["__hmax2_nan"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
 
 
-register_global(h2trunc, types.Function(_typing_h2trunc))
+register_global(__hmax2_nan, types.Function(_typing___hmax2_nan))
 
 
 @register
-class _typing_h2ceil(ConcreteTemplate):
-    key = globals()["h2ceil"]
-    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+class _typing___hmin2_nan(ConcreteTemplate):
+    key = globals()["__hmin2_nan"]
+    cases = [
+        signature(
+            _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162
+        )
+    ]
 
 
-register_global(h2ceil, types.Function(_typing_h2ceil))
+register_global(__hmin2_nan, types.Function(_typing___hmin2_nan))
 
 
 @register
-class _typing_h2floor(ConcreteTemplate):
-    key = globals()["h2floor"]
-    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+class _typing___hfma2_relu(ConcreteTemplate):
+    key = globals()["__hfma2_relu"]
+    cases = [
+        signature(
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+        )
+    ]
 
 
-register_global(h2floor, types.Function(_typing_h2floor))
+register_global(__hfma2_relu, types.Function(_typing___hfma2_relu))
 
 
 @register
-class _typing_h2rint(ConcreteTemplate):
-    key = globals()["h2rint"]
-    cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)]
+class _typing___hcmadd(ConcreteTemplate):
+    key = globals()["__hcmadd"]
+    cases = [
+        signature(
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+            _type___nv_bfloat162,
+        )
+    ]
 
 
-register_global(h2rint, types.Function(_typing_h2rint))
+register_global(__hcmadd, types.Function(_typing___hcmadd))
 
 
 @register
@@ -4991,9 +15962,18 @@ class _typing_atomicAdd(ConcreteTemplate):
 register_global(atomicAdd, types.Function(_typing_atomicAdd))
 
 
+@register
+class _typing___half(ConcreteTemplate):
+    key = globals()["__half"]
+    cases = [signature(void, _type___nv_bfloat16)]
+
+
+register_global(__half, types.Function(_typing___half))
+
+
 @register_global(operator.add)
-class _typing_operator_add(ConcreteTemplate):
-    cases = [
+class _typing_operator_add(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5004,8 +15984,8 @@ class _typing_operator_add(ConcreteTemplate):
 
 
 @register_global(operator.sub)
-class _typing_operator_sub(ConcreteTemplate):
-    cases = [
+class _typing_operator_sub(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5016,8 +15996,8 @@ class _typing_operator_sub(ConcreteTemplate):
 
 
 @register_global(operator.mul)
-class _typing_operator_mul(ConcreteTemplate):
-    cases = [
+class _typing_operator_mul(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5028,8 +16008,8 @@ class _typing_operator_mul(ConcreteTemplate):
 
 
 @register_global(operator.truediv)
-class _typing_operator_truediv(ConcreteTemplate):
-    cases = [
+class _typing_operator_truediv(BinOpTrueDiv):
+    cases = BinOpTrueDiv.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5040,8 +16020,8 @@ class _typing_operator_truediv(ConcreteTemplate):
 
 
 @register_global(operator.iadd)
-class _typing_operator_iadd(ConcreteTemplate):
-    cases = [
+class _typing_operator_iadd(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5052,8 +16032,8 @@ class _typing_operator_iadd(ConcreteTemplate):
 
 
 @register_global(operator.isub)
-class _typing_operator_isub(ConcreteTemplate):
-    cases = [
+class _typing_operator_isub(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5064,8 +16044,8 @@ class _typing_operator_isub(ConcreteTemplate):
 
 
 @register_global(operator.imul)
-class _typing_operator_imul(ConcreteTemplate):
-    cases = [
+class _typing_operator_imul(BinOp):
+    cases = BinOp.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5076,8 +16056,8 @@ class _typing_operator_imul(ConcreteTemplate):
 
 
 @register_global(operator.itruediv)
-class _typing_operator_itruediv(ConcreteTemplate):
-    cases = [
+class _typing_operator_itruediv(BinOpTrueDiv):
+    cases = BinOpTrueDiv.cases + [
         signature(
             _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16
         ),
@@ -5088,71 +16068,327 @@ class _typing_operator_itruediv(ConcreteTemplate):
 
 
 @register_global(operator.pos)
-class _typing_operator_pos(ConcreteTemplate):
-    cases = [
+class _typing_operator_pos(UnaryPositive):
+    cases = UnaryPositive.cases + [
         signature(_type___nv_bfloat16, _type___nv_bfloat16),
         signature(_type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.neg)
-class _typing_operator_neg(ConcreteTemplate):
-    cases = [
+class _typing_operator_neg(UnaryNegate):
+    cases = UnaryNegate.cases + [
         signature(_type___nv_bfloat16, _type___nv_bfloat16),
         signature(_type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.eq)
-class _typing_operator_eq(ConcreteTemplate):
-    cases = [
+class _typing_operator_eq(UnorderedCmpOp):
+    cases = UnorderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.ne)
-class _typing_operator_ne(ConcreteTemplate):
-    cases = [
+class _typing_operator_ne(UnorderedCmpOp):
+    cases = UnorderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.gt)
-class _typing_operator_gt(ConcreteTemplate):
-    cases = [
+class _typing_operator_gt(OrderedCmpOp):
+    cases = OrderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.lt)
-class _typing_operator_lt(ConcreteTemplate):
-    cases = [
+class _typing_operator_lt(OrderedCmpOp):
+    cases = OrderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.ge)
-class _typing_operator_ge(ConcreteTemplate):
-    cases = [
+class _typing_operator_ge(OrderedCmpOp):
+    cases = OrderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 @register_global(operator.le)
-class _typing_operator_le(ConcreteTemplate):
-    cases = [
+class _typing_operator_le(OrderedCmpOp):
+    cases = OrderedCmpOp.cases + [
         signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16),
         signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162),
     ]
 
 
 # Aliases:
-__nv_bfloat16_raw = unnamed1401637
-__nv_bfloat162_raw = unnamed1401746
+__nv_bfloat16_raw = unnamed1405307
+__nv_bfloat162_raw = unnamed1405416
 nv_bfloat16 = __nv_bfloat16
 nv_bfloat162 = __nv_bfloat162
+
+
+# Symbols:
+
+
+_NBTYPE_SYMBOLS = [
+    "_type_unnamed1405307",
+    "_type_unnamed1405416",
+    "_type___nv_bfloat16",
+    "_type___nv_bfloat162",
+]
+
+
+_RECORD_SYMBOLS = [
+    "unnamed1405307",
+    "unnamed1405416",
+    "__nv_bfloat16",
+    "__nv_bfloat162",
+]
+
+
+_FUNCTION_SYMBOLS = [
+    "__double2bfloat16",
+    "__float2bfloat16",
+    "__float2bfloat16_rn",
+    "__float2bfloat16_rz",
+    "__float2bfloat16_rd",
+    "__float2bfloat16_ru",
+    "__bfloat162float",
+    "__float2bfloat162_rn",
+    "__floats2bfloat162_rn",
+    "__low2float",
+    "__high2float",
+    "__float22bfloat162_rn",
+    "__bfloat1622float2",
+    "__bfloat162char_rz",
+    "__bfloat162uchar_rz",
+    "__bfloat162int_rn",
+    "__bfloat162int_rz",
+    "__bfloat162int_rd",
+    "__bfloat162int_ru",
+    "__int2bfloat16_rn",
+    "__int2bfloat16_rz",
+    "__int2bfloat16_rd",
+    "__int2bfloat16_ru",
+    "__bfloat162short_rn",
+    "__bfloat162short_rz",
+    "__bfloat162short_rd",
+    "__bfloat162short_ru",
+    "__short2bfloat16_rn",
+    "__short2bfloat16_rz",
+    "__short2bfloat16_rd",
+    "__short2bfloat16_ru",
+    "__bfloat162uint_rn",
+    "__bfloat162uint_rz",
+    "__bfloat162uint_rd",
+    "__bfloat162uint_ru",
+    "__uint2bfloat16_rn",
+    "__uint2bfloat16_rz",
+    "__uint2bfloat16_rd",
+    "__uint2bfloat16_ru",
+    "__bfloat162ushort_rn",
+    "__bfloat162ushort_rz",
+    "__bfloat162ushort_rd",
+    "__bfloat162ushort_ru",
+    "__ushort2bfloat16_rn",
+    "__ushort2bfloat16_rz",
+    "__ushort2bfloat16_rd",
+    "__ushort2bfloat16_ru",
+    "__bfloat162ull_rn",
+    "__bfloat162ull_rz",
+    "make_bfloat162",
+    "__bfloat162ull_rd",
+    "__bfloat162ull_ru",
+    "__ull2bfloat16_rn",
+    "__ull2bfloat16_rz",
+    "__ull2bfloat16_rd",
+    "__ull2bfloat16_ru",
+    "__bfloat162ll_rn",
+    "__bfloat162ll_rz",
+    "__bfloat162ll_rd",
+    "__bfloat162ll_ru",
+    "__ll2bfloat16_rn",
+    "__ll2bfloat16_rz",
+    "__ll2bfloat16_rd",
+    "__ll2bfloat16_ru",
+    "htrunc",
+    "hceil",
+    "hfloor",
+    "hrint",
+    "h2trunc",
+    "h2ceil",
+    "h2floor",
+    "h2rint",
+    "__bfloat162bfloat162",
+    "__lowhigh2highlow",
+    "__lows2bfloat162",
+    "__highs2bfloat162",
+    "__high2bfloat16",
+    "__low2bfloat16",
+    "__hisinf",
+    "__halves2bfloat162",
+    "__low2bfloat162",
+    "__high2bfloat162",
+    "__bfloat16_as_short",
+    "__bfloat16_as_ushort",
+    "__short_as_bfloat16",
+    "__ushort_as_bfloat16",
+    "__shfl_sync",
+    "__shfl_sync",
+    "__shfl_up_sync",
+    "__shfl_up_sync",
+    "__shfl_down_sync",
+    "__shfl_down_sync",
+    "__shfl_xor_sync",
+    "__shfl_xor_sync",
+    "__ldg",
+    "__ldg",
+    "__ldcg",
+    "__ldcg",
+    "__ldca",
+    "__ldca",
+    "__ldcs",
+    "__ldcs",
+    "__ldlu",
+    "__ldlu",
+    "__ldcv",
+    "__ldcv",
+    "__stwb",
+    "__stwb",
+    "__stcg",
+    "__stcg",
+    "__stcs",
+    "__stcs",
+    "__stwt",
+    "__stwt",
+    "__heq2",
+    "__hne2",
+    "__hle2",
+    "__hge2",
+    "__hlt2",
+    "__hgt2",
+    "__hequ2",
+    "__hneu2",
+    "__hleu2",
+    "__hgeu2",
+    "__hltu2",
+    "__hgtu2",
+    "__heq2_mask",
+    "__hne2_mask",
+    "__hle2_mask",
+    "__hge2_mask",
+    "__hlt2_mask",
+    "__hgt2_mask",
+    "__hequ2_mask",
+    "__hneu2_mask",
+    "__hleu2_mask",
+    "__hgeu2_mask",
+    "__hltu2_mask",
+    "__hgtu2_mask",
+    "__hisnan2",
+    "__hadd2",
+    "__hsub2",
+    "__hmul2",
+    "__hadd2_rn",
+    "__hsub2_rn",
+    "__hmul2_rn",
+    "__h2div",
+    "__habs2",
+    "__hadd2_sat",
+    "__hsub2_sat",
+    "__hmul2_sat",
+    "__hfma2",
+    "__hfma2_sat",
+    "__hneg2",
+    "__habs",
+    "__hadd",
+    "__hsub",
+    "__hmul",
+    "__hadd_rn",
+    "__hsub_rn",
+    "__hmul_rn",
+    "__hdiv",
+    "__hadd_sat",
+    "__hsub_sat",
+    "__hmul_sat",
+    "__hfma",
+    "__hfma_sat",
+    "__hneg",
+    "__hbeq2",
+    "__hbne2",
+    "__hble2",
+    "__hbge2",
+    "__hblt2",
+    "__hbgt2",
+    "__hbequ2",
+    "__hbneu2",
+    "__hbleu2",
+    "__hbgeu2",
+    "__hbltu2",
+    "__hbgtu2",
+    "__heq",
+    "__hne",
+    "__hle",
+    "__hge",
+    "__hlt",
+    "__hgt",
+    "__hequ",
+    "__hneu",
+    "__hleu",
+    "__hgeu",
+    "__hltu",
+    "__hgtu",
+    "__hisnan",
+    "__hmax",
+    "__hmin",
+    "__hmax_nan",
+    "__hmin_nan",
+    "__hfma_relu",
+    "__hmax2",
+    "__hmin2",
+    "__hmax2_nan",
+    "__hmin2_nan",
+    "__hfma2_relu",
+    "__hcmadd",
+    "hsqrt",
+    "hrsqrt",
+    "hrcp",
+    "hlog",
+    "hlog2",
+    "hlog10",
+    "hexp",
+    "htanh_approx",
+    "h2tanh_approx",
+    "htanh",
+    "h2tanh",
+    "hexp2",
+    "hexp10",
+    "hcos",
+    "hsin",
+    "h2sqrt",
+    "h2rsqrt",
+    "h2rcp",
+    "h2log",
+    "h2log2",
+    "h2log10",
+    "h2exp",
+    "h2exp2",
+    "h2exp10",
+    "h2cos",
+    "h2sin",
+    "atomicAdd",
+]
+
+
+__all__ = list(dict.fromkeys(_NBTYPE_SYMBOLS + _RECORD_SYMBOLS + _FUNCTION_SYMBOLS))
diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py
index cb2f41dc6..54f3a0b74 100644
--- a/numba_cuda/numba/cuda/bf16.py
+++ b/numba_cuda/numba/cuda/bf16.py
@@ -2,8 +2,116 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from numba.cuda._internal.cuda_bf16 import (
-    _type_class___nv_bfloat16,
+    typing_registry,
+    target_registry,
     nv_bfloat16 as bfloat16,
+    # Arithmetic intrinsics
+    __habs as habs,
+    __hadd as hadd,
+    __hsub as hsub,
+    __hmul as hmul,
+    __hadd_rn as hadd_rn,
+    __hsub_rn as hsub_rn,
+    __hmul_rn as hmul_rn,
+    __hdiv as hdiv,
+    __hadd_sat as hadd_sat,
+    __hsub_sat as hsub_sat,
+    __hmul_sat as hmul_sat,
+    __hfma as hfma,
+    __hfma_sat as hfma_sat,
+    __hneg as hneg,
+    __hfma_relu as hfma_relu,
+    # Comparison intrinsics
+    __heq as heq,
+    __hne as hne,
+    __hge as hge,
+    __hgt as hgt,
+    __hle as hle,
+    __hlt as hlt,
+    __hmax as hmax,
+    __hmin as hmin,
+    __hmax_nan as hmax_nan,
+    __hmin_nan as hmin_nan,
+    __hisinf as hisinf,
+    __hisnan as hisnan,
+    # Unordered comparison intrinsics
+    __hequ as hequ,
+    __hneu as hneu,
+    __hgeu as hgeu,
+    __hgtu as hgtu,
+    __hleu as hleu,
+    __hltu as hltu,
+    # Precision conversion and data movement
+    # - floating-point family
+    __bfloat162float as bfloat162float,
+    __float2bfloat16 as float2bfloat16,
+    __double2bfloat16 as double2bfloat16,
+    __float2bfloat16_rn as float2bfloat16_rn,
+    __float2bfloat16_rz as float2bfloat16_rz,
+    __float2bfloat16_rd as float2bfloat16_rd,
+    __float2bfloat16_ru as float2bfloat16_ru,
+    # - char family
+    __bfloat162char_rz as bfloat162char_rz,
+    __bfloat162uchar_rz as bfloat162uchar_rz,
+    # - int family (signed 32-bit)
+    __int2bfloat16_rn as int2bfloat16_rn,
+    __int2bfloat16_rz as int2bfloat16_rz,
+    __int2bfloat16_rd as int2bfloat16_rd,
+    __int2bfloat16_ru as int2bfloat16_ru,
+    __bfloat162int_rn as bfloat162int_rn,
+    __bfloat162int_rz as bfloat162int_rz,
+    __bfloat162int_rd as bfloat162int_rd,
+    __bfloat162int_ru as bfloat162int_ru,
+    # - short family (signed 16-bit)
+    __short2bfloat16_rn as short2bfloat16_rn,
+    __short2bfloat16_rz as short2bfloat16_rz,
+    __short2bfloat16_rd as short2bfloat16_rd,
+    __short2bfloat16_ru as short2bfloat16_ru,
+    __bfloat162short_rn as bfloat162short_rn,
+    __bfloat162short_rz as bfloat162short_rz,
+    __bfloat162short_rd as bfloat162short_rd,
+    __bfloat162short_ru as bfloat162short_ru,
+    # - ushort family (unsigned 16-bit)
+    __ushort2bfloat16_rn as ushort2bfloat16_rn,
+    __ushort2bfloat16_rz as ushort2bfloat16_rz,
+    __ushort2bfloat16_rd as ushort2bfloat16_rd,
+    __ushort2bfloat16_ru as ushort2bfloat16_ru,
+    __bfloat162ushort_rn as bfloat162ushort_rn,
+    __bfloat162ushort_rz as bfloat162ushort_rz,
+    __bfloat162ushort_rd as bfloat162ushort_rd,
+    __bfloat162ushort_ru as bfloat162ushort_ru,
+    # - uint family (unsigned 32-bit)
+    __uint2bfloat16_rn as uint2bfloat16_rn,
+    __uint2bfloat16_rz as uint2bfloat16_rz,
+    __uint2bfloat16_rd as uint2bfloat16_rd,
+    __uint2bfloat16_ru as uint2bfloat16_ru,
+    __bfloat162uint_rn as bfloat162uint_rn,
+    __bfloat162uint_rz as bfloat162uint_rz,
+    __bfloat162uint_rd as bfloat162uint_rd,
+    __bfloat162uint_ru as bfloat162uint_ru,
+    # - ll family (signed 64-bit)
+    __ll2bfloat16_rn as ll2bfloat16_rn,
+    __ll2bfloat16_rz as ll2bfloat16_rz,
+    __ll2bfloat16_rd as ll2bfloat16_rd,
+    __ll2bfloat16_ru as ll2bfloat16_ru,
+    __bfloat162ll_rn as bfloat162ll_rn,
+    __bfloat162ll_rz as bfloat162ll_rz,
+    __bfloat162ll_rd as bfloat162ll_rd,
+    __bfloat162ll_ru as bfloat162ll_ru,
+    # - ull family (unsigned 64-bit)
+    __ull2bfloat16_rn as ull2bfloat16_rn,
+    __ull2bfloat16_rz as ull2bfloat16_rz,
+    __ull2bfloat16_rd as ull2bfloat16_rd,
+    __ull2bfloat16_ru as ull2bfloat16_ru,
+    __bfloat162ull_rn as bfloat162ull_rn,
+    __bfloat162ull_rz as bfloat162ull_rz,
+    __bfloat162ull_rd as bfloat162ull_rd,
+    __bfloat162ull_ru as bfloat162ull_ru,
+    # - bit reinterpret casts
+    __bfloat16_as_short as bfloat16_as_short,
+    __bfloat16_as_ushort as bfloat16_as_ushort,
+    __short_as_bfloat16 as short_as_bfloat16,
+    __ushort_as_bfloat16 as ushort_as_bfloat16,
     htrunc,
     hceil,
     hfloor,
@@ -28,7 +136,7 @@
 
 
 def _make_unary(a, func):
-    if isinstance(a, _type_class___nv_bfloat16):
+    if a == bfloat16:
         return lambda a: func(a)
 
 
@@ -92,9 +200,184 @@ def exp2_ol(a):
 except ImportError:
     pass
 
+# Public aliases using Numba/NumPy-style type names
+# Floating-point
+float32_to_bfloat16 = float2bfloat16
+float64_to_bfloat16 = double2bfloat16
+bfloat16_to_float32 = bfloat162float
+float32_to_bfloat16_rn = float2bfloat16_rn
+float32_to_bfloat16_rz = float2bfloat16_rz
+float32_to_bfloat16_rd = float2bfloat16_rd
+float32_to_bfloat16_ru = float2bfloat16_ru
+
+# Char (8-bit)
+bfloat16_to_int8_rz = bfloat162char_rz
+bfloat16_to_uint8_rz = bfloat162uchar_rz
+
+# Int16 / UInt16
+int16_to_bfloat16_rn = short2bfloat16_rn
+int16_to_bfloat16_rz = short2bfloat16_rz
+int16_to_bfloat16_rd = short2bfloat16_rd
+int16_to_bfloat16_ru = short2bfloat16_ru
+bfloat16_to_int16_rn = bfloat162short_rn
+bfloat16_to_int16_rz = bfloat162short_rz
+bfloat16_to_int16_rd = bfloat162short_rd
+bfloat16_to_int16_ru = bfloat162short_ru
+
+uint16_to_bfloat16_rn = ushort2bfloat16_rn
+uint16_to_bfloat16_rz = ushort2bfloat16_rz
+uint16_to_bfloat16_rd = ushort2bfloat16_rd
+uint16_to_bfloat16_ru = ushort2bfloat16_ru
+bfloat16_to_uint16_rn = bfloat162ushort_rn
+bfloat16_to_uint16_rz = bfloat162ushort_rz
+bfloat16_to_uint16_rd = bfloat162ushort_rd
+bfloat16_to_uint16_ru = bfloat162ushort_ru
+
+# Int32 / UInt32
+int32_to_bfloat16_rn = int2bfloat16_rn
+int32_to_bfloat16_rz = int2bfloat16_rz
+int32_to_bfloat16_rd = int2bfloat16_rd
+int32_to_bfloat16_ru = int2bfloat16_ru
+bfloat16_to_int32_rn = bfloat162int_rn
+bfloat16_to_int32_rz = bfloat162int_rz
+bfloat16_to_int32_rd = bfloat162int_rd
+bfloat16_to_int32_ru = bfloat162int_ru
+
+uint32_to_bfloat16_rn = uint2bfloat16_rn
+uint32_to_bfloat16_rz = uint2bfloat16_rz
+uint32_to_bfloat16_rd = uint2bfloat16_rd
+uint32_to_bfloat16_ru = uint2bfloat16_ru
+bfloat16_to_uint32_rn = bfloat162uint_rn
+bfloat16_to_uint32_rz = bfloat162uint_rz
+bfloat16_to_uint32_rd = bfloat162uint_rd
+bfloat16_to_uint32_ru = bfloat162uint_ru
+
+# Int64 / UInt64
+int64_to_bfloat16_rn = ll2bfloat16_rn
+int64_to_bfloat16_rz = ll2bfloat16_rz
+int64_to_bfloat16_rd = ll2bfloat16_rd
+int64_to_bfloat16_ru = ll2bfloat16_ru
+bfloat16_to_int64_rn = bfloat162ll_rn
+bfloat16_to_int64_rz = bfloat162ll_rz
+bfloat16_to_int64_rd = bfloat162ll_rd
+bfloat16_to_int64_ru = bfloat162ll_ru
+
+uint64_to_bfloat16_rn = ull2bfloat16_rn
+uint64_to_bfloat16_rz = ull2bfloat16_rz
+uint64_to_bfloat16_rd = ull2bfloat16_rd
+uint64_to_bfloat16_ru = ull2bfloat16_ru
+bfloat16_to_uint64_rn = bfloat162ull_rn
+bfloat16_to_uint64_rz = bfloat162ull_rz
+bfloat16_to_uint64_rd = bfloat162ull_rd
+bfloat16_to_uint64_ru = bfloat162ull_ru
+
+# Bit reinterpret casts
+bfloat16_as_int16 = bfloat16_as_short
+bfloat16_as_uint16 = bfloat16_as_ushort
+int16_as_bfloat16 = short_as_bfloat16
+uint16_as_bfloat16 = ushort_as_bfloat16
 
 __all__ = [
+    "typing_registry",
+    "target_registry",
     "bfloat16",
+    # Arithmetic intrinsics
+    "habs",
+    "hadd",
+    "hsub",
+    "hmul",
+    "hadd_rn",
+    "hsub_rn",
+    "hmul_rn",
+    "hdiv",
+    "hadd_sat",
+    "hsub_sat",
+    "hmul_sat",
+    "hfma",
+    "hfma_sat",
+    "hneg",
+    "hfma_relu",
+    # Comparison intrinsics
+    "heq",
+    "hne",
+    "hge",
+    "hgt",
+    "hle",
+    "hlt",
+    "hmax",
+    "hmin",
+    "hmax_nan",
+    "hmin_nan",
+    "hisinf",
+    "hisnan",
+    "hequ",
+    "hneu",
+    "hgeu",
+    "hgtu",
+    "hleu",
+    "hltu",
+    # Precision conversion and data movement
+    "float32_to_bfloat16",
+    "float64_to_bfloat16",
+    "bfloat16_to_float32",
+    "float32_to_bfloat16_rn",
+    "float32_to_bfloat16_rz",
+    "float32_to_bfloat16_rd",
+    "float32_to_bfloat16_ru",
+    "bfloat16_to_int8_rz",
+    "bfloat16_to_uint8_rz",
+    "int16_to_bfloat16_rn",
+    "int16_to_bfloat16_rz",
+    "int16_to_bfloat16_rd",
+    "int16_to_bfloat16_ru",
+    "bfloat16_to_int16_rn",
+    "bfloat16_to_int16_rz",
+    "bfloat16_to_int16_rd",
+    "bfloat16_to_int16_ru",
+    "uint16_to_bfloat16_rn",
+    "uint16_to_bfloat16_rz",
+    "uint16_to_bfloat16_rd",
+    "uint16_to_bfloat16_ru",
+    "bfloat16_to_uint16_rn",
+    "bfloat16_to_uint16_rz",
+    "bfloat16_to_uint16_rd",
+    "bfloat16_to_uint16_ru",
+    "int32_to_bfloat16_rn",
+    "int32_to_bfloat16_rz",
+    "int32_to_bfloat16_rd",
+    "int32_to_bfloat16_ru",
+    "bfloat16_to_int32_rn",
+    "bfloat16_to_int32_rz",
+    "bfloat16_to_int32_rd",
+    "bfloat16_to_int32_ru",
+    "uint32_to_bfloat16_rn",
+    "uint32_to_bfloat16_rz",
+    "uint32_to_bfloat16_rd",
+    "uint32_to_bfloat16_ru",
+    "bfloat16_to_uint32_rn",
+    "bfloat16_to_uint32_rz",
+    "bfloat16_to_uint32_rd",
+    "bfloat16_to_uint32_ru",
+    "int64_to_bfloat16_rn",
+    "int64_to_bfloat16_rz",
+    "int64_to_bfloat16_rd",
+    "int64_to_bfloat16_ru",
+    "bfloat16_to_int64_rn",
+    "bfloat16_to_int64_rz",
+    "bfloat16_to_int64_rd",
+    "bfloat16_to_int64_ru",
+    "uint64_to_bfloat16_rn",
+    "uint64_to_bfloat16_rz",
+    "uint64_to_bfloat16_rd",
+    "uint64_to_bfloat16_ru",
+    "bfloat16_to_uint64_rn",
+    "bfloat16_to_uint64_rz",
+    "bfloat16_to_uint64_rd",
+    "bfloat16_to_uint64_ru",
+    "bfloat16_as_int16",
+    "bfloat16_as_uint16",
+    "int16_as_bfloat16",
+    "uint16_as_bfloat16",
     "htrunc",
     "hceil",
     "hfloor",
diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
index 1a1035a25..a26e24c93 100644
--- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py
+++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -355,7 +355,11 @@ def compile(src, name, cc, ltoir=False):
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
 
-    numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}"
+    nvrtc_ver_major = version[0]
+    if nvrtc_ver_major not in (12, 13):
+        # Headers are bundled only for CUDA 12 and 13; fail loudly here.
+        raise RuntimeError(f"Unsupported NVRTC version {nvrtc_ver_major}")
+    numba_include = os.path.join(numba_cuda_path, "include", str(nvrtc_ver_major))
 
     if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
         extra_includes = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h
new file mode 100644
index 000000000..38feffba0
--- /dev/null
+++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h
@@ -0,0 +1,5118 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics
+* This section describes nv_bfloat16 precision intrinsic functions.
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+* All of the functions defined here are available in device code.
+* Some of the functions are also available to host compilers, please
+* refer to respective functions' documentation for details.
+*
+* NOTE: Aggressive floating-point optimizations performed by host or device
+* compilers may affect numeric behavior of the functions implemented in this
+* header. Specific examples are:
+* - hsin(__nv_bfloat16);
+* - hcos(__nv_bfloat16);
+* - h2sin(__nv_bfloat162);
+* - h2cos(__nv_bfloat162);
+*
+* The following macros are available to help users selectively enable/disable
+* various definitions present in the header file:
+* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of
+* additional type aliases in the global namespace, helping to avoid potential
+* conflicts with symbols defined in the user program.
+* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent
+* the use of the C++ type conversions (converting constructors and conversion
+* operators) that are common for built-in floating-point types, but may be
+* undesirable for \p __nv_bfloat16 which is essentially a user-defined type.
+* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ -
+* If defined, these macros will prevent the inadvertent use of usual arithmetic
+* and comparison operators. This enforces the storage-only type semantics and
+* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these constants, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+#ifndef __CUDA_BF16_H__
+#define __CUDA_BF16_H__
+
+/* bring in __half data type and operations, for use in converting constructors */
+#include "cuda_fp16.h"
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */
+
+#define __CUDA_BF16_TYPES_EXIST__
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Forward-declaration of structures defined in "cuda_bf16.hpp" */
+struct __nv_bfloat16;
+struct __nv_bfloat162;
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode.
+* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __double2bfloat16(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-to-nearest-even mode.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-to-nearest-even mode.
+* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rn(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-towards-zero mode.
+* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rz(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-down mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-down mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-down mode.
+* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_rd(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-up mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-up mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16 using round-up mode.
+* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2bfloat16_ru(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts \p nv_bfloat16 number to float.
+*
+* \details Converts nv_bfloat16 number \p a to float.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns float
+* - \p a converted to float.
+* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __bfloat162float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+*
+* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+* \param[in] a - float. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
+* precision number.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
+* mode and returns \p nv_bfloat162 with converted values.
+*
+* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode
+* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read.
+* \param[in] b - float. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with corresponding halves equal to the
+* converted input floats.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result.
+*
+* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns float
+* - The low 16 bits of \p a converted to float.
+*
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result.
+*
+* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns float
+* - The high 16 bits of \p a converted to float.
+*
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both components of float2 number to nv_bfloat16 precision in
+* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values.
+*
+* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even
+* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 which has corresponding halves equal to the
+* converted float2 components.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result.
+*
+* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the
+* result as a \p float2 packed value.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns float2
+* - \p a converted to float2.
+*
+* \see __bfloat162float(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns signed char
+* - \p h converted to a signed char using round-towards-zero mode.
+* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F.
+* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80.
+* - __bfloat162char_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned char
+* - \p h converted to an unsigned char using round-towards-zero mode.
+* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF.
+* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __bfloat162uchar_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-to-nearest-even mode.
+* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-towards-zero mode.
+* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-down mode.
+* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-to-nearest-even mode.
+* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-towards-zero mode.
+* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-down mode.
+* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-up mode.
+* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __bfloat162short_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode.
+*
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode.
+*
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode.
+*
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
+*
+* \details Combines two input \p nv_bfloat16 numbers \p x and \p y into one \p nv_bfloat162 number.
+* Input \p x is stored in low 16 bits of the return value, input \p y is stored
+* in high 16 bits of the return value.
+* \param[in] x - nv_bfloat16. Is only being read.
+* \param[in] y - nv_bfloat16. Is only being read.
+*
+* \returns __nv_bfloat162
+* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The truncated integer value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The smallest integer value not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The nearest integer to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
+*
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The truncated \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument.
+*
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of smallest integers not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector floor of the input argument.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of largest integers which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+*
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16
+* number.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Swaps both halves of the \p nv_bfloat162 input.
+*
+* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
+* with swapped halves.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - \p a with its halves being swapped.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines
+* into one \p nv_bfloat162 number.
+*
+* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of
+* the return value, low 16 bits from input \p b is stored in high 16 bits of
+* the return value.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and
+* combines into one \p nv_bfloat162 number.
+*
+* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. High 16 bits from input \p a is stored in low 16 bits of
+* the return value, high 16 bits from input \p b is stored in high 16 bits of
+* the return value.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns high 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns high 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat16
+* - The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns low 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns low 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat16
+* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Checks if the input \p nv_bfloat16 number is infinite.
+*
+* \details Checks if the input \p nv_bfloat16 number \p a is infinite.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - -1 if \p a is equal to negative infinity,
+* - 1 if \p a is equal to positive infinity,
+* - 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
+*
+* \details Combines two input \p nv_bfloat16 numbers \p a and \p b into one \p nv_bfloat162 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from \p nv_bfloat162 input.
+*
+* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the low 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from \p nv_bfloat162 input.
+*
+* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer.
+*
+* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
+* as a signed short integer.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer.
+*
+* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
+* as an unsigned short integer.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16.
+*
+* \details Reinterprets the bits in the signed short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16.
+*
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA)
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane.
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1],
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e.
+* within the same subsection). \p width must have a value which is a power of 2;
+* results are undefined if \p width is not a power of 2, or is a number greater than
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read.
+* \param[in] srcLane - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID.
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged.
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2,
+* or is a number greater than \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID.
+* The value of \p var held by the resulting thread ID is returned: this has the effect
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of \p width and the upper \p delta threads
+* will remain unchanged.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask:
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each
+* group of \p width consecutive threads are able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of \p var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat162. Is only being read.
+* \param[in] laneMask - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane.
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1],
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e.
+* within the same subsection). \p width must have a value which is a power of 2;
+* results are undefined if \p width is not a power of 2, or is a number greater than
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read.
+* \param[in] srcLane - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID.
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged.
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2,
+* or is a number greater than \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID.
+* The value of \p var held by the resulting thread ID is returned: this has the effect
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. Similarly to __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of \p width and the upper \p delta threads
+* will remain unchanged.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask:
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each
+* group of \p width consecutive threads is able to access elements from earlier groups of threads;
+* however, if they attempt to access elements from later groups of threads, their own value of \p var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - nv_bfloat16. Is only being read.
+* \param[in] laneMask - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA)
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Determine whether \p nv_bfloat162 argument is a NaN.
+*
+* \details Determines whether each \p nv_bfloat16 element of the \p nv_bfloat162 input \p a is a NaN.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to
+* 1.0 for NaN, 0.0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+add
+* or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise division of \p a with \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplication of vectors \p a and \p b,
+* with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
+* with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Negates both halves of the input \p nv_bfloat162 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with both halves negated.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+*
+* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The absolute value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode.
+*
+* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of dividing \p a by \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__  __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+* \param[in] c - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on
+* \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+* \param[in] c - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on
+* \p a, \p b, and \p c, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Negates input \p nv_bfloat16 number and returns the result.
+*
+* \details Negates input \p nv_bfloat16 number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The negated value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true
+* if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of not-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of greater-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-than comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean
+* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of greater-than
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered if-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and
+* returns boolean true if both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-equal comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns
+* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-than comparison of
+* vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and
+* returns boolean true if both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-than comparison.
+*
+* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 unordered greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat16 unordered less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 unordered greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Determines whether \p nv_bfloat16 argument is a NaN.
+*
+* \details Determines whether \p nv_bfloat16 value \p a is a NaN.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns bool
+* - true if \p a is NaN, false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The maximum of \p a and \p b, following the NaN and signed-zero rules above.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The minimum of \p a and \p b, following the NaN and signed-zero rules above.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The maximum of \p a and \p b, or canonical NaN if either input is NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The minimum of \p a and \p b, or canonical NaN if either input is NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then a negative result is clamped to 0.
+* A NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+* \param[in] c - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p a, \p b, and \p c
+* with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate.
+*
+* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as
+* complex numbers in \p nv_bfloat16 precision and performs
+* complex multiply-accumulate operation: \p a * \p b + \p c.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The reciprocal square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The reciprocal of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The natural logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The binary logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The decimal logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The natural exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function.
+*
+* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula.
+* This operation uses HW acceleration on devices of compute capability 9.x and higher.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The approximate hyperbolic tangent function of \p a.
+* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh_approx(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function.
+*
+* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a.
+* This operation uses HW acceleration on devices of compute capability 9.x and higher.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise approximate hyperbolic tangent function on vector \p a.
+*
+* \see htanh_approx(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The hyperbolic tangent function of \p a.
+* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise hyperbolic tangent function on vector \p a.
+*
+* \see htanh(__nv_bfloat16) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The binary exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The decimal exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls cosf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float)
+* into an intrinsic __cosf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The cosine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls sinf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float)
+* into an intrinsic __sinf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise natural logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise binary logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise decimal logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise binary exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise decimal exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even
+* mode.
+*
+* NOTE: this function's implementation calls cosf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float)
+* into an intrinsic __cosf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise cosine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode.
+*
+* NOTE: this function's implementation calls sinf(float) function and is exposed
+* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float)
+* into an intrinsic __sinf(float), which has less accurate numeric behavior.
+*
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The elementwise sine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access.
+*
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher,
+* older devices use an emulation path.
+*
+* \param[in,out] address - __nv_bfloat162*. An address in global or shared memory; the sum is written back to it.
+* \param[in] val - __nv_bfloat162. The value to be added.
+*
+* \returns __nv_bfloat162
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+*
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher;
+* older devices of compute capability 7.x and 8.x use an emulation path.
+*
+* \param[in] address - __nv_bfloat16*. An address in global or shared memory.
+* \param[in] val - __nv_bfloat16. The value to be added.
+*
+* \returns __nv_bfloat16
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+
+#endif /* defined(__cplusplus) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#endif
+
+/* C++11 header for ::std::move.
+ * In RTC mode, ::std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for ::std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_BF16_INLINE__
+#define __CUDA_BF16_FORCEINLINE__
+#else
+#define __CUDA_BF16_INLINE__ inline
+#define __CUDA_BF16_FORCEINLINE__ __forceinline__
+#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+#endif /* defined(__CUDACC__) */
+
+// define __CUDA_BF16_CONSTEXPR__ in order to
+// use constexpr where possible, with supporting C++ dialects
+// undef after use
+#if (defined __CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_BF16_CONSTEXPR__   constexpr
+#else
+#define __CUDA_BF16_CONSTEXPR__
+#endif
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief __nv_bfloat16_raw data type
+ * \details Type allows static initialization of \p nv_bfloat16 until it becomes
+ * a built-in type.
+ *
+ * - Note: this initialization is as a bit-field representation of \p nv_bfloat16,
+ * and not a conversion from \p short to \p nv_bfloat16.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(2) {
+    /**
+     * Storage field contains the bit representation of the \p nv_bfloat16 floating-point number.
+     */
+    unsigned short x;
+} __nv_bfloat16_raw;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief __nv_bfloat162_raw data type
+ * \details Type allows static initialization of \p nv_bfloat162 until it becomes
+ * a built-in type.
+ *
+ * - Note: this initialization is as a bit-field representation of \p nv_bfloat162,
+ * and not a conversion from \p short2 to \p nv_bfloat162.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(4) {
+    /**
+     * Storage field contains the bits of the lower \p nv_bfloat16 part.
+     */
+    unsigned short x;
+    /**
+     * Storage field contains the bits of the upper \p nv_bfloat16 part.
+     */
+    unsigned short y;
+} __nv_bfloat162_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* MSVC warning C4522: 'class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief nv_bfloat16 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * nv_bfloat16 floating-point numbers. The structure implements
+ * assignment operators and type conversions. 16 bits are being
+ * used in total: 1 sign bit, 8 bits for the exponent, and
+ * the significand is being stored in 7 bits. The total
+ * precision is 8 bits.
+ *
+ */
+struct __CUDA_ALIGN__(2) __nv_bfloat16 {
+protected:
+    /**
+     * Protected storage variable contains the bits of floating-point data.
+     */
+    unsigned short __x;
+
+public:
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat16() = default;
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat16() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Convert to/from __nv_bfloat16_raw */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from \p __nv_bfloat16_raw.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat16_raw.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16.
+     */
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16.
+     */
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16_raw operator.
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16_raw operator with \p volatile input.
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile;
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f)));
+,
+    __x = __float2bfloat16(__half2float(f)).__x;
+)
+}
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Construct from float/double */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p float operator.
+     */
+    __CUDA_HOSTDEVICE__ operator float() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f);
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f);
+
+/*
+ * Implicit type conversions to/from integer types were only available to nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overloads resolution and will require users to update their code.
+ * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ */
+#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) /* same width as long long: use the long long conversion, else the int one */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ll2bfloat16_rn(static_cast<long long>(val)).__x;
+        } else {
+            __x = __int2bfloat16_rn(static_cast<int>(val)).__x;
+        }
+    }
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) /* same width as unsigned long long: use that conversion, else the unsigned int one */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ull2bfloat16_rn(static_cast<unsigned long long>(val)).__x;
+        } else {
+            __x = __uint2bfloat16_rn(static_cast<unsigned int>(val)).__x;
+        }
+    }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; }
+
+    /* Allow automatic casts to supported built-in types, matching all that are permitted with float */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162char_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator signed char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162uchar_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     */
+    __CUDA_HOSTDEVICE__ operator char() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162short_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator short() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162ushort_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned short() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162int_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator int() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162uint_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned int() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p long data type.
+     * Using round-toward-zero rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ operator long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p long data type.
+     * Using round-toward-zero rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162ll_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator long long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p unsigned \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * See __bfloat162ull_rz(__nv_bfloat16) for further details
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+};
+
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+/* Some basic arithmetic operations expected of a built-in */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 addition operation.
+ * See also __hadd(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 subtraction operation.
+ * See also __hsub(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 multiplication operation.
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 division operation.
+ * See also __hdiv(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with addition operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with subtraction operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with multiplication operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 compound assignment with division operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+
+/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 prefix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 prefix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 postfix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Performs \p nv_bfloat16 postfix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored);
+/* Unary plus and inverse operators */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Implements \p nv_bfloat16 unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+ * Implements \p nv_bfloat16 unary minus operator.
+ * See also __hneg(__nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h);
+
+/* Some basic comparison operations to make it look like a built-in */
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered compare equal operation.
+ * See also __heq(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 unordered compare not-equal operation.
+ * See also __hneu(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered greater-than compare operation.
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered less-than compare operation.
+ * See also __hlt(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered greater-or-equal compare operation.
+ * See also __hge(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+ * Performs \p nv_bfloat16 ordered less-or-equal compare operation.
+ * See also __hle(__nv_bfloat16, __nv_bfloat16)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh);
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+
+/**
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief nv_bfloat162 datatype
+ * \details This structure implements the datatype for storing two
+ * nv_bfloat16 floating-point numbers.
+ * The structure implements assignment, arithmetic and comparison
+ * operators, and type conversions.
+ *
+ * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers
+ */
+struct __CUDA_ALIGN__(4) __nv_bfloat162 {
+    /**
+     * Storage field holding lower \p __nv_bfloat16 part.
+     */
+    __nv_bfloat16 x;
+    /**
+     * Storage field holding upper \p __nv_bfloat16 part.
+     */
+    __nv_bfloat16 y;
+
+    // All construct/copy/assign/move
+public:
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat162() = default;
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Move constructor, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Move assignment operator, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src);
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat162();
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from two \p __nv_bfloat16 variables
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { }
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Copy constructor
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Copy assignment operator
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src);
+
+    /* Convert to/from __nv_bfloat162_raw */
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Constructor from \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r );
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Assignment operator from \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r);
+    /**
+     * \ingroup CUDA_MATH__BFLOAT16_MISC
+     * Conversion operator to \p __nv_bfloat162_raw
+     */
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const;
+};
+
+#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 addition operation.
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 subtraction operation.
+ * See also __hsub2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 multiplication operation.
+ * See also __hmul2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 division operation.
+ * See also __h2div(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with addition operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with subtraction operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with multiplication operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 compound assignment with division operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 prefix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 prefix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 postfix increment operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Performs packed \p nv_bfloat16 postfix decrement operation.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Implements packed \p nv_bfloat16 unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+ * Implements packed \p nv_bfloat16 unary minus operator.
+ * See also __hneg2(__nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered compare equal operation.
+ * See also __hbeq2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 unordered compare not-equal operation.
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered greater-than compare operation.
+ * See also __hbgt2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered less-than compare operation.
+ * See also __hblt2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation.
+ * See also __hbge2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+/**
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+ * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation.
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh);
+
+#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */
+
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+__CUDA_HOSTDEVICE__
+#ifdef __CUDACC_RTC__
+inline
+#else
+__CUDA_BF16_FORCEINLINE__
+#endif
+__half::__half(const __nv_bfloat16 f)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f)));
+,
+    __x = __float2half_rn(__bfloat162float(f)).__x;
+)
+}
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#endif /* defined(__cplusplus) */
+
+#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \
+    !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))))
+/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in
+   function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at
+   link time.
+*/
+#include "cuda_bf16.hpp"
+#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \
+          !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */
+
+/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */
+/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of the bfloat16 numbers format.
+ *
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __nv_bfloat16  nv_bfloat16;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of type for pairs of bfloat16 numbers.
+ *
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __nv_bfloat162 nv_bfloat162;
+
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */
+
+#undef __CUDA_BF16_DECL__
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_BF16_INLINE__
+#undef __CUDA_BF16_FORCEINLINE__
+#undef ___CUDA_BF16_STRINGIFY_INNERMOST
+#undef __CUDA_BF16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_BF16_H__ */
diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp
new file mode 100644
index 000000000..5f610c976
--- /dev/null
+++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp
@@ -0,0 +1,3865 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_BF16_HPP__)
+#define __CUDA_BF16_HPP__
+
+#if !defined(__CUDA_BF16_H__)
+#error "Do not include this file directly. Instead, include cuda_bf16.h."
+#endif
+
+#if !defined(IF_DEVICE_OR_CUDACC) /* three-way code selector; only defined here if not already defined */
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) /* __CUDACC__ defined: d for device targets, c otherwise; f is discarded */
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) /* __CUDACC__ not defined: d for device targets, f otherwise; c is discarded */
+#endif
+#endif
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the floating-point positive infinity value for the \p nv_bfloat16 data type (bit pattern 0x7F80)
+ */
+#define CUDART_INF_BF16            __ushort_as_bfloat16((unsigned short)0x7F80U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the canonical NaN value for the \p nv_bfloat16 data type (bit pattern 0x7FFF)
+ */
+#define CUDART_NAN_BF16            __ushort_as_bfloat16((unsigned short)0x7FFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the minimum positive representable (denormalized) value for the \p nv_bfloat16 data type (bit pattern 0x0001)
+ */
+#define CUDART_MIN_DENORM_BF16     __ushort_as_bfloat16((unsigned short)0x0001U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the maximum representable finite value for the \p nv_bfloat16 data type (bit pattern 0x7F7F)
+ */
+#define CUDART_MAX_NORMAL_BF16     __ushort_as_bfloat16((unsigned short)0x7F7FU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the negative zero value for the \p nv_bfloat16 data type (bit pattern 0x8000, only the top bit set)
+ */
+#define CUDART_NEG_ZERO_BF16       __ushort_as_bfloat16((unsigned short)0x8000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the positive zero value for the \p nv_bfloat16 data type (bit pattern 0x0000)
+ */
+#define CUDART_ZERO_BF16           __ushort_as_bfloat16((unsigned short)0x0000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS
+ * \brief Defines the value 1.0 for the \p nv_bfloat16 data type (bit pattern 0x3F80)
+ */
+#define CUDART_ONE_BF16            __ushort_as_bfloat16((unsigned short)0x3F80U)
+
+#if !(defined __DOXYGEN_ONLY__)
+
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } // assign from raw storage: copies hr.x into __x unchanged
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } // same copy for a volatile destination
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } // same copy for a volatile source and destination
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } // convert to raw storage: returns a raw struct whose x holds __x
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } // same conversion for a volatile object
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator float() const { return __bfloat162float(*this); } // conversion to float delegates to __bfloat162float
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } // assignment from float converts with __float2bfloat16 and stores the resulting bits
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } // assignment from double converts with __double2bfloat16 and stores the resulting bits
+
+/*
+ * Implicit type conversions to/from integer types were only available under nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overload resolution and will require users to update their code.
+ * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ */
+#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator signed char() const { return __bfloat162char_rz(*this); } // delegates to the _rz signed-char conversion helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned char() const { return __bfloat162uchar_rz(*this); } // delegates to the _rz unsigned-char conversion helper
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator char() const { // conversion to plain char; picks the signed or unsigned _rz helper to match char's signedness
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) // true when plain char is a signed type for this compiler/target
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(__bfloat162char_rz(*this)); // signed char path
+        }
+        else
+        {
+            value = static_cast<char>(__bfloat162uchar_rz(*this)); // unsigned char path
+        }
+        return value;
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator short() const { return __bfloat162short_rz(*this); } // delegates to __bfloat162short_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned short() const { return __bfloat162ushort_rz(*this); } // delegates to __bfloat162ushort_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator int() const { return __bfloat162int_rz(*this); } // delegates to __bfloat162int_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned int() const { return __bfloat162uint_rz(*this); } // delegates to __bfloat162uint_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long() const { // conversion to long; the helper is chosen by the size of long
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) // long is as wide as long long: use the long long helper
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__bfloat162ll_rz(*this));
+        }
+        else
+        {
+            retval = static_cast<long>(__bfloat162int_rz(*this)); // otherwise the int helper is used
+        }
+        return retval;
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long() const { // conversion to unsigned long; the helper is chosen by the size of unsigned long
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) // as wide as unsigned long long: use the unsigned long long helper
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__bfloat162ull_rz(*this));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__bfloat162uint_rz(*this)); // otherwise the unsigned int helper is used
+        }
+        return retval;
+    }
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long long() const { return __bfloat162ll_rz(*this); } // delegates to __bfloat162ll_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long long() const { return __bfloat162ull_rz(*this); } // delegates to __bfloat162ull_rz
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } // integer assignments below all convert with the matching *_rn helper and store its bits
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } // via __ushort2bfloat16_rn
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } // via __int2bfloat16_rn
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } // via __uint2bfloat16_rn
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } // via __ll2bfloat16_rn
+    __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } // via __ull2bfloat16_rn
+#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+
+
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } // binary arithmetic operators forward to the __hadd/__hsub/__hmul/__hdiv intrinsics
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } // compound forms store the intrinsic's result back into lh and return lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80U; h += one; return h; } // prefix ++: adds 0x3F80 (same bits as CUDART_ONE_BF16, i.e. 1.0) and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80U; h -= one; return h; } // prefix --: subtracts 0x3F80 (1.0) and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored) // postfix ++: increments h by 1.0 and returns the value h had before
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h; // snapshot of the pre-increment value
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80U; // same bit pattern as CUDART_ONE_BF16 (1.0)
+    h += one;
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored) // postfix --: decrements h by 1.0 and returns the value h had before
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h; // snapshot of the pre-decrement value
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80U; // same bit pattern as CUDART_ONE_BF16 (1.0)
+    h -= one;
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } // unary plus returns a copy of the operand
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } // unary minus forwards to __hneg
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } // comparisons forward to the scalar compare intrinsics
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } // note: uses __hneu, not a negated __heq
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(__nv_bfloat162 &&src) { // move constructor: copies both halves of src into *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); // device: one assignment through the *_TO_UI / *_TO_CUI views (presumably a 32-bit reinterpretation of the pair; macro defined outside this chunk)
+,
+    this->x = src.x; // non-device: member-wise copy
+    this->y = src.y;
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(__nv_bfloat162 &&src) { // move assignment: copies both halves of src into *this and returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); // device: one assignment through the *_TO_UI / *_TO_CUI views (presumably a 32-bit reinterpretation of the pair; macro defined outside this chunk)
+,
+    this->x = src.x; // non-device: member-wise copy
+    this->y = src.y;
+)
+    return *this;
+}
+#else
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162() { } // used when __CPP_VERSION_AT_LEAST_11_BF16 is not defined: default constructor with an empty body
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162 &src) { // copy constructor: copies both halves of src into *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+   __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); // device: one assignment through the *_TO_UI / *_TO_CUI views (presumably a 32-bit reinterpretation of the pair; macro defined outside this chunk)
+,
+    this->x = src.x; // non-device: member-wise copy
+    this->y = src.y;
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162 &src) { // copy assignment: copies both halves of src into *this and returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+   __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); // device: one assignment through the *_TO_UI / *_TO_CUI views (presumably a 32-bit reinterpretation of the pair; macro defined outside this chunk)
+,
+    this->x = src.x; // non-device: member-wise copy
+    this->y = src.y;
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162_raw &h2r ) { // construct from raw pair storage
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); // device: one assignment through the *_TO_UI / *_TO_CUI views (macro defined outside this chunk)
+,
+    __nv_bfloat16_raw tr; // non-device: route each raw half through a scalar raw temporary
+    tr.x = h2r.x;
+    this->x = static_cast<__nv_bfloat16>(tr); // x member takes the bits of h2r.x
+    tr.x = h2r.y;
+    this->y = static_cast<__nv_bfloat16>(tr); // y member takes the bits of h2r.y
+)
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162_raw &h2r) { // assign from raw pair storage; returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); // device: one assignment through the *_TO_UI / *_TO_CUI views (macro defined outside this chunk)
+,
+    __nv_bfloat16_raw tr; // non-device: route each raw half through a scalar raw temporary
+    tr.x = h2r.x;
+    this->x = static_cast<__nv_bfloat16>(tr); // x member takes the bits of h2r.x
+    tr.x = h2r.y;
+    this->y = static_cast<__nv_bfloat16>(tr); // y member takes the bits of h2r.y
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::operator __nv_bfloat162_raw() const { // convert the pair to its raw storage struct
+    __nv_bfloat162_raw ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    ret.x = 0U; // device: both fields are zeroed first, then overwritten by the assignment below
+    ret.y = 0U;
+    __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this);
+,
+    ret.x = static_cast<__nv_bfloat16_raw>(this->x).x; // non-device: copy each half's raw bits individually
+    ret.y = static_cast<__nv_bfloat16_raw>(this->y).x;
+)
+    return ret;
+}
+
+#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } // pair arithmetic operators forward to the __hadd2/__hsub2/__hmul2/__h2div intrinsics
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } // compound forms store the intrinsic's result back into lh and return lh
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hadd2(h, one); return h; } // prefix ++: adds 0x3F80 (1.0) to both halves and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hsub2(h, one); return h; } // prefix --: subtracts 0x3F80 (1.0) from both halves and returns the updated h
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored) // postfix ++: increments both halves by 1.0 and returns the value h had before
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h; // snapshot of the pre-increment value
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80U; // same bit pattern as CUDART_ONE_BF16 (1.0), for each half
+    one.y = 0x3F80U;
+    h = __hadd2(h, one);
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored) // postfix --: decrements both halves by 1.0 and returns the value h had before
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h; // snapshot of the pre-decrement value
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80U; // same bit pattern as CUDART_ONE_BF16 (1.0), for each half
+    one.y = 0x3F80U;
+    h = __hsub2(h, one);
+    return ret;
+}
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } // unary plus returns a copy of the operand
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } // unary minus forwards to __hneg2
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } // pair comparisons yield one bool via the __hb*2 intrinsics (how the two halves are combined is defined by those intrinsics, not visible here)
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); }
+__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); }
+#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_float_as_uint(const float f) // returns the object representation of f reinterpreted as an unsigned int
+{
+    unsigned int u;
+IF_DEVICE_OR_CUDACC(
+    u = __float_as_uint(f); // device code: intrinsic
+,
+    memcpy(&u, &f, sizeof(f)); // non-device code when __CUDACC__ is defined: unqualified memcpy
+,
+    ::std::memcpy(&u, &f, sizeof(f)); // non-device code when __CUDACC__ is not defined: ::std::memcpy
+)
+    return u;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_uint_as_float(const unsigned int u) // returns the object representation of u reinterpreted as a float
+{
+    float f;
+IF_DEVICE_OR_CUDACC(
+    f = __uint_as_float(u); // device code: intrinsic
+,
+    memcpy(&f, &u, sizeof(u)); // non-device code when __CUDACC__ is defined: unqualified memcpy
+,
+    ::std::memcpy(&f, &u, sizeof(u)); // non-device code when __CUDACC__ is not defined: ::std::memcpy
+)
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) // splits f's bits: returns the upper 16 bits, writes the sign bit and the discarded lower bits through the reference parameters
+{
+    unsigned int x;
+
+    x = __internal_float_as_uint(f);
+
+    if ((x & 0x7fffffffU) > 0x7f800000U) { // magnitude bits above the infinity pattern: f is NaN
+        sign = 0U;
+        remainder = 0U;
+        return static_cast<unsigned short>(0x7fffU); // same bits as CUDART_NAN_BF16
+    }
+    sign = x >> 31U; // 1 for negative inputs, 0 otherwise
+    remainder = x << 16U; // the 16 dropped low bits, moved to the top of the word so callers can compare against 0x80000000 (half)
+    return static_cast<unsigned short>(x >> 16U); // truncated (upper-half) bits
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_double2float_rn(const double x) // converts a double to float
+{
+    float r;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f32.f64 %0, %1;" : "=f"(r) : "d"(x)); // device: explicit PTX conversion with the .rn rounding modifier
+,
+    r = static_cast<float>(x); // non-device: plain C++ conversion
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ double __internal_float2double(const float x) // converts a float to double
+{
+    double r;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.f64.f32 %0, %1;" : "=d"(r) : "f"(x)); // device: explicit PTX conversion
+,
+    r = static_cast<double>(x); // non-device: plain C++ conversion
+)
+    return r;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) // converts a double to bfloat16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("{  cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); // SM_90 and newer: single PTX conversion
+    return val;
+,
+    float f = __internal_double2float_rn(x); // fallback: go through float, then patch the float so the second rounding sees what was lost
+    const double d = __internal_float2double(f); // f widened back, for comparison against the exact input
+    unsigned int u = __internal_float_as_uint(f);
+
+    bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); // sign bit shifted out; anything above the shifted infinity pattern is NaN
+
+
+    if ((x > 0.0) && (d > x)) { // positive input whose float conversion came out larger: step the bits back by one
+        u--;
+    }
+    if ((x < 0.0) && (d < x)) { // negative input whose float conversion came out larger in magnitude: step back by one
+        u--;
+    }
+    if ((d != x) && x_is_not_nan) { // conversion was inexact: force the lowest float bit on as a marker (presumably to avoid double-rounding errors in the final step)
+        u |= 1U;
+    }
+
+    f = __internal_uint_as_float(u);
+
+    return __float2bfloat16(f); // final rounding to bfloat16
+)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) // converts a float to bfloat16; body is the same as __float2bfloat16_rn
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); // SM_80 and newer: single PTX conversion
+,
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder); // fallback: start from the truncated bits
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // round up when more than half was dropped, or exactly half and the kept bits are odd (ties to even)
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) // converts a float to bfloat16, rounding to nearest with ties to even
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); // SM_80 and newer: single PTX conversion
+,
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder); // fallback: start from the truncated bits
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // round up when more than half was dropped, or exactly half and the kept bits are odd
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) // converts a float to bfloat16, rounding toward zero
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); // SM_80 and newer: single PTX conversion
+,
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U; // sign and remainder are outputs of the helper; unused for truncation
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder); // fallback: the truncated bits are the result as-is
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) // converts a float to bfloat16, rounding down (toward negative infinity)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("{  cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); // SM_90 and newer: single PTX conversion
+    return val;
+,
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder); // fallback: start from the truncated bits
+    if ((remainder != 0U) && (sign != 0U)) { // inexact and negative: bump the bits, which grows the magnitude of a negative value
+        r.x++;
+    }
+    val = r;
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) // converts a float to bfloat16, rounding up (toward positive infinity)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); // SM_90 and newer: single PTX conversion
+    return val;
+,
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder); // fallback: start from the truncated bits
+    if ((remainder != 0U) && (sign == 0U)) { // inexact and non-negative: bump the bits to the next larger value
+        r.x++;
+    }
+    val = r;
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) // converts one float to bfloat16 and places the same value in both halves of the pair
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{.reg .b16 low;\n"
+        "  cvt.rn.bf16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a));
+,
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); // fallback: convert with the scalar helper for each half
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) // converts two floats to a bfloat16 pair
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); // NOTE(review): operands are passed as %2 (b) then %1 (a); presumably this places a in the first half to match the fallback - confirm against the PTX ISA
+,
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); // fallback: a and b converted separately, a passed first
+)
+    return val;
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ float __internal_device_bfloat162float(const unsigned short h) // device-only helper: widens raw bfloat16 bits to a float
+{
+    float f;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); // SM_90 and newer: dedicated PTX conversion
+,
+    asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); // older targets: build the 32-bit float from the 16-bit pair {0, h}
+)
+    return f;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) // widens raw bfloat16 bits to a float
+{
+    float f;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    f = __internal_device_bfloat162float(h); // device: delegate to the device-only helper
+,
+    unsigned int u = static_cast<unsigned int>(h) << 16; // non-device: the 16 bits become the upper half of a 32-bit word, lower half zero
+    f = __internal_uint_as_float(u);
+)
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) // converts a bfloat16 value to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); // extract the raw 16 bits, then widen
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) // converts the x half of the pair to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x);
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) // converts the y half of the pair to float
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y);
+}
+
+/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) // builds a pair whose x and y members are the given values
+{
+    __nv_bfloat162 t; t.x = x; t.y = y; return t;
+}
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) // converts both components of a float2 to a bfloat16 pair
+{
+    __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); // forwards the components to the two-float overload
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) // converts both halves of a bfloat16 pair to a float2
+{
+    float hi_float;
+    float lo_float;
+    lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); // x half -> first component
+    hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); // y half -> second component
+    return make_float2(lo_float, hi_float);
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) // converts bfloat16 to int (the _rn variant); only compiled under __CUDACC__ or _NVHPC_CUDA
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    int val;
+    asm("{  cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // SM_90 and newer: direct PTX conversion
+    return val;
+,
+    return __float2int_rn(__bfloat162float(h)); // otherwise: widen to float, then use the float intrinsic
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_BF16_DECL__ int __internal_bfloat162int_rz(const __nv_bfloat16 h) // converts bfloat16 to int via float; the non-device path saturates and maps NaN to 0
+{
+    const float f = __bfloat162float(h);
+    int   i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    i = __float2int_rz(f); // device: float intrinsic
+,
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<int>(f); // in range: plain C++ conversion
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) // converts bfloat16 to int (the _rz variant)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    int val;
+    asm("{  cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // SM_90 and newer: direct PTX conversion
+    return val;
+,
+    return __internal_bfloat162int_rz(h); // otherwise: shared helper that goes through float
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) // converts bfloat16 to int (the _rd variant); only compiled under __CUDACC__ or _NVHPC_CUDA
+{
+    int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // SM_90 and newer: direct PTX conversion from bf16
+,
+    const float f = __bfloat162float(h); // otherwise: widen to float first
+    asm("cvt.rmi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); // same .rmi rounding modifier, applied to the float
+)
+    return val;
+}
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) // converts bfloat16 to int (the _ru variant); only compiled under __CUDACC__ or _NVHPC_CUDA
+{
+    int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // SM_90 and newer: direct PTX conversion from bf16
+,
+    const float f = __bfloat162float(h); // otherwise: widen to float first
+    asm("cvt.rpi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); // same .rpi rounding modifier, applied to the float
+)
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_int2bfloat16_rn(const int i) // device-only helper: converts an int to bfloat16 (the _rn variant)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+        __nv_bfloat16 val;
+       asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // SM_90 and newer: direct PTX conversion
+       return val;
+,
+        const float ru = __int2float_ru(i); // otherwise: convert to float in three rounding modes
+        const float rd = __int2float_rd(i);
+        float rz = __int2float_rz(i);
+        if (ru != rd) { // up- and down-rounded results differ, so i is not exactly representable as a float
+            rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest float bit on as a marker (presumably to avoid double-rounding errors in the final step)
+        }
+        return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) // converts an int to bfloat16 (the _rn variant)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_int2bfloat16_rn(i); // device: delegate to the device-only helper
+,
+    const double d = static_cast<double>(i); // non-device: go through double
+    return __double2bfloat16(d);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h) // converts bfloat16 to signed char (the _rz variant); the fallback saturates and maps NaN to 0
+{
+    signed char i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned short tmp = 0;
+    asm("{ .reg.b8 myreg;\n"
+        "  cvt.rzi.s8.bf16 myreg, %1;\n"
+        "  mov.b16 %0, {myreg, 0};\n}"
+         :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h)));
+    const unsigned char u = static_cast<unsigned char>(tmp); // the 8-bit result is returned packed in a 16-bit register; narrow it back
+    i = static_cast<signed char>(u);
+,
+    const float f = __bfloat162float(h); // fallback: widen to float and clamp manually
+    const signed char max_val = (signed char)0x7fU;
+    const signed char min_val = (signed char)0x80U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the sign shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) { // above the shifted infinity pattern
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<signed char>(f);
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h) // bf16 -> unsigned char, rounding toward zero; the C++ branch maps NaN to 0 and saturates out-of-range values
+{
+    unsigned char i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned short tmp = 0;
+    asm("{ .reg.b8 myreg;\n"
+        "  cvt.rzi.u8.bf16 myreg, %1;\n" // convert into an 8-bit scratch register
+        "  mov.b16 %0, {myreg, 0};\n}" // pack the 8-bit result into a 16-bit output together with a zero byte
+         :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h)));
+    i = static_cast<unsigned char>(tmp); // keep only the low byte
+,
+    const float f = __bfloat162float(h); // C++ branch: taken whenever NV_PROVIDES_SM_90 does not hold
+    const unsigned char max_val = 0xffU;
+    const unsigned char min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned char>(f);
+    }
+)
+    return i;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) // int -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct s32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rz(__int2float_rz(i)); // fallback: int -> float -> bf16, both steps with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) // int -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct s32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rd(__int2float_rd(i)); // fallback: int -> float -> bf16, both steps with _rd
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) // int -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct s32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_ru(__int2float_ru(i)); // fallback: int -> float -> bf16, both steps with _ru
+)
+}
+
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) // bf16 -> short using PTX cvt with the .rni modifier (round to nearest integer)
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rni.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+
+__CUDA_BF16_DECL__ short int __internal_device_bfloat162short_rz(const __nv_bfloat16 h) // device helper: bf16 -> short using PTX cvt with the .rzi modifier (round to integer toward zero)
+{
+    short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s16 conversion
+,
+    asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+        "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+        "  cvt.rzi.s16.f32 %0,f;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) // bf16 -> short, rounding toward zero; callable from host and device
+{
+    short int val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    val = __internal_device_bfloat162short_rz(h); // device: delegate to the device-only helper
+,
+    const float f = __bfloat162float(h); // host: convert via float with explicit NaN and range handling
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    } else {
+        val = static_cast<short int>(f); // in range: a plain cast is sufficient
+    }
+)
+   return val;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) // bf16 -> short using PTX cvt with the .rmi modifier (round to integer toward -infinity)
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rmi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) // bf16 -> short using PTX cvt with the .rpi modifier (round to integer toward +infinity)
+{
+   short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rpi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) // short -> bf16, round-to-nearest; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct s16 -> bf16 conversion
+    return val;
+,
+    const float f = static_cast<float>(i); // fallback: plain C++ cast to float, then float -> bf16 with _rn
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) // short -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct s16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rz(__int2float_rz(static_cast<int>(i))); // fallback: widen to int, then int -> float -> bf16 with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) // short -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct s16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rd(__int2float_rd(static_cast<int>(i))); // fallback: widen to int, then int -> float -> bf16 with _rd
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) // short -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct s16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_ru(__int2float_ru(static_cast<int>(i))); // fallback: widen to int, then int -> float -> bf16 with _ru
+)
+}
+
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) // bf16 -> unsigned int using the round-to-nearest (.rni / _rn) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned int val;
+    asm("{  cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u32 conversion
+    return val;
+,
+    return __float2uint_rn(__bfloat162float(h)); // fallback: widen to float, then float -> uint with _rn
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_bfloat162uint_rz(const __nv_bfloat16 h) // helper: bf16 -> unsigned int via float, rounding toward zero
+{
+    const float f = __bfloat162float(h);
+    unsigned int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    i = __float2uint_rz(f); // device: use the float -> uint intrinsic
+,
+    const unsigned int max_val = 0xffffffffU; // host: explicit NaN and range handling before the cast
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0U;
+    } else if (f >= static_cast<float>(max_val)) { // note ">=" here: (float)max_val is presumably not exactly representable, so equality must already saturate
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<unsigned int>(f); // in range: a plain cast is sufficient
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) // bf16 -> unsigned int, rounding toward zero; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned int val;
+    asm("{  cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u32 conversion
+    return val;
+,
+    return __internal_bfloat162uint_rz(h); // otherwise: shared helper that converts via float
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) // bf16 -> unsigned int using the round-down (.rmi / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned int val;
+    asm("{  cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u32 conversion
+    return val;
+,
+    return __float2uint_rd(__bfloat162float(h)); // fallback: widen to float, then float -> uint with _rd
+)
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) // bf16 -> unsigned int using PTX cvt with the .rpi modifier (round to integer toward +infinity)
+{
+    unsigned int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u32 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float first, then convert with the same .rpi modifier
+    asm("cvt.rpi.u32.f32 %0, %1;" : "=r"(val) : "f"(f));
+)
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_uint2bfloat16_rn(const unsigned int i) // device helper: unsigned int -> bf16 with round-to-nearest
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct u32 -> bf16 conversion
+    return val;
+,
+    const float ru = __uint2float_ru(i); // fallback: convert through float while tracking whether that step was exact
+    const float rd = __uint2float_rd(i);
+    float rz = __uint2float_rz(i);
+    if (ru != rd) { // round-up and round-down results differ, so i is not exactly representable as float
+        rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest bit of the truncated value on as an "inexact" marker (presumably to keep the final rounding from seeing a false tie)
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) // unsigned int -> bf16, round-to-nearest; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_uint2bfloat16_rn(i); // device: delegate to the device-only helper
+,
+    const double d = static_cast<double>(i); // host: go through double and convert that to bf16
+    return __double2bfloat16(d);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) // unsigned int -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct u32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rz(__uint2float_rz(i)); // fallback: uint -> float -> bf16, both steps with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) // unsigned int -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct u32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rd(__uint2float_rd(i)); // fallback: uint -> float -> bf16, both steps with _rd
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) // unsigned int -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); // direct u32 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_ru(__uint2float_ru(i)); // fallback: uint -> float -> bf16, both steps with _ru
+)
+}
+
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h) // bf16 -> unsigned short using PTX cvt with the .rni modifier (round to nearest integer)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rni.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+
+__CUDA_BF16_DECL__ unsigned short int __internal_device_bfloat162ushort_rz(const __nv_bfloat16 h) // device helper: bf16 -> unsigned short using PTX cvt with the .rzi modifier (round to integer toward zero)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rzi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) // bf16 -> unsigned short, rounding toward zero; callable from host and device
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+   val = __internal_device_bfloat162ushort_rz(h); // device: delegate to the device-only helper
+,
+    const float f = __bfloat162float(h); // host: convert via float with explicit NaN and range handling
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    } else {
+        val = static_cast<unsigned short int>(f); // in range: a plain cast is sufficient
+    }
+)
+   return val;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) // bf16 -> unsigned short using PTX cvt with the .rmi modifier (round to integer toward -infinity)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rmi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) // bf16 -> unsigned short using PTX cvt with the .rpi modifier (round to integer toward +infinity)
+{
+   unsigned short int val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u16 conversion
+,
+   asm("{ .reg.f32 f;\n" // fallback: widen to f32 inside the asm block, then convert
+       "  mov.b32 f, {0,%1};\n" // f = 16 zero bits packed together with the 16 bf16 bits
+       "  cvt.rpi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+)
+   return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) // unsigned short -> bf16, round-to-nearest; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct u16 -> bf16 conversion
+    return val;
+,
+    const float f = static_cast<float>(i); // fallback: plain C++ cast to float, then float -> bf16 with _rn
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) // unsigned short -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct u16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rz(__uint2float_rz(static_cast<unsigned int>(i))); // fallback: widen to unsigned int, then uint -> float -> bf16 with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) // unsigned short -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct u16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_rd(__uint2float_rd(static_cast<unsigned int>(i))); // fallback: widen to unsigned int, then uint -> float -> bf16 with _rd
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) // unsigned short -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); // direct u16 -> bf16 conversion
+    return val;
+,
+    return __float2bfloat16_ru(__uint2float_ru(static_cast<unsigned int>(i))); // fallback: widen to unsigned int, then uint -> float -> bf16 with _ru
+)
+}
+
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) // bf16 -> unsigned long long using the round-to-nearest (.rni / _rn) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned long long int i;
+    asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u64 conversion
+    return i;
+,
+    return __float2ull_rn(__bfloat162float(h)); // fallback: widen to float, then float -> ull with _rn
+)
+}
+
+__CUDA_BF16_DECL__ unsigned long long int __internal_device_bfloat162ull_rz(const __nv_bfloat16 h) // device helper: bf16 -> unsigned long long, rounding toward zero
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u64 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float, then float -> ull with _rz
+    i = __float2ull_rz(f);
+)
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) // bf16 -> unsigned long long, rounding toward zero; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_bfloat162ull_rz(h); // device: delegate to the device-only helper
+,
+    const float f = __bfloat162float(h); // host: convert via float with explicit NaN and range handling
+    unsigned long long int i;
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0x8000000000000000ULL; // NaN maps to this fixed pattern (only the top bit set), not to 0
+    } else if (f >= static_cast<float>(max_val)) { // note ">=" here: (float)max_val is presumably not exactly representable, so equality must already saturate
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<unsigned long long int>(f); // in range: a plain cast is sufficient
+    }
+    return i;
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) // bf16 -> unsigned long long using the round-down (.rmi / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    unsigned long long int i;
+    asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u64 conversion
+    return i;
+,
+    return __float2ull_rd(__bfloat162float(h)); // fallback: widen to float, then float -> ull with _rd
+)
+}
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) // bf16 -> unsigned long long using PTX cvt with the .rpi modifier (round to integer toward +infinity)
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> u64 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float first, then convert with the same .rpi modifier
+    asm("cvt.rpi.u64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ull2bfloat16_rn(const unsigned long long int i) // device helper: unsigned long long -> bf16 with round-to-nearest
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct u64 -> bf16 conversion
+    return h;
+,
+    const float ru = __ull2float_ru(i); // fallback: convert through float while tracking whether that step was exact
+    const float rd = __ull2float_rd(i);
+    float rz = __ull2float_rz(i);
+    if (ru != rd) { // round-up and round-down results differ, so i is not exactly representable as float
+        rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest bit of the truncated value on as an "inexact" marker (presumably to keep the final rounding from seeing a false tie)
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) // unsigned long long -> bf16, round-to-nearest; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_ull2bfloat16_rn(i); // device: delegate to the device-only helper
+,
+    float f = static_cast<float>(i); // host: cast to float, then repair the float so the final rounding sees the true value
+    const unsigned long long int uf = static_cast<unsigned long long int>(f); // convert back to detect whether (and in which direction) the cast rounded
+    unsigned int u = __internal_float_as_uint(f);
+    // round up happened here
+    // note: no need to handle round up to f == 0x1.p64 specially
+    if (uf > i) {
+        u--; // step the bit pattern down by one to undo the upward rounding
+    }
+    if (uf != i) {
+        u |= 1U; // cast was inexact: force the lowest bit on as an "inexact" marker for the final rounding
+    }
+    f = __internal_uint_as_float(u);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) // unsigned long long -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct u64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_rz(__ull2float_rz(i)); // fallback: ull -> float -> bf16, both steps with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) // unsigned long long -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct u64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_rd(__ull2float_rd(i)); // fallback: ull -> float -> bf16, both steps with _rd
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) // unsigned long long -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct u64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_ru(__ull2float_ru(i)); // fallback: ull -> float -> bf16, both steps with _ru
+)
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) // bf16 -> long long using the round-to-nearest (.rni / _rn) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    long long int i;
+    asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s64 conversion
+    return i;
+,
+    return __float2ll_rn(__bfloat162float(h)); // fallback: widen to float, then float -> ll with _rn
+)
+}
+
+__CUDA_BF16_DECL__ long long int __internal_device_bfloat162ll_rz(const __nv_bfloat16 h) // device helper: bf16 -> long long, rounding toward zero
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s64 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float, then float -> ll with _rz
+    i = __float2ll_rz(f);
+)
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) // bf16 -> long long, rounding toward zero; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_bfloat162ll_rz(h); // device: delegate to the device-only helper
+,
+    long long int i; // host: convert via float with explicit NaN and range handling
+    const float f = __bfloat162float(h);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); // raw bits with the top (sign) bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = min_val; // NaN maps to the minimum value, not to 0
+    } else if (f >= static_cast<float>(max_val)) { // note ">=" here: (float)max_val is presumably not exactly representable, so equality must already saturate
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        i = static_cast<long long int>(f); // in range: a plain cast is sufficient
+    }
+    return i;
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) // bf16 -> long long using PTX cvt with the .rmi modifier (round to integer toward -infinity)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s64 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float first, then convert with the same .rmi modifier
+    asm("cvt.rmi.s64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) // bf16 -> long long using PTX cvt with the .rpi modifier (round to integer toward +infinity)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); // direct bf16 -> s64 conversion
+,
+    const float f = __bfloat162float(h); // fallback: widen to float first, then convert with the same .rpi modifier
+    asm("cvt.rpi.s64.f32 %0, %1;" : "=l"(i) : "f"(f));
+)
+    return i;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ll2bfloat16_rn(const long long int i) // device helper: long long -> bf16 with round-to-nearest
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct s64 -> bf16 conversion
+    return h;
+,
+    const float ru = __ll2float_ru(i); // fallback: convert through float while tracking whether that step was exact
+    const float rd = __ll2float_rd(i);
+    float rz = __ll2float_rz(i);
+    if (ru != rd) { // round-up and round-down results differ, so i is not exactly representable as float
+        rz = __uint_as_float(__float_as_uint(rz) | 1U); // force the lowest bit of the truncated value on as an "inexact" marker (presumably to keep the final rounding from seeing a false tie)
+    }
+    return __float2bfloat16_rn(rz);
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) // long long -> bf16, round-to-nearest; callable from host and device
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_ll2bfloat16_rn(i); // device: delegate to the device-only helper
+,
+    float f = static_cast<float>(i); // host: cast to float, then repair the float so the final rounding sees the true value
+    const long long int lf = static_cast<long long int>(f); // convert back to detect whether (and in which direction) the cast rounded
+    unsigned int u = __internal_float_as_uint(f);
+
+    if ((f > 0.0f) && (lf > i)) { // positive value whose cast rounded upward
+        u--; // step the bit pattern down by one
+    }
+    if ((f < 0.0f) && (lf < i)) { // negative value whose cast rounded downward
+        u--; // step the bit pattern down by one
+    }
+    if (lf != i) {
+        u |= 1U; // cast was inexact: force the lowest bit on as an "inexact" marker for the final rounding
+    }
+
+    f = __internal_uint_as_float(u);
+    return __float2bfloat16_rn(f);
+)
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) // long long -> bf16 using the round-toward-zero (.rz / _rz) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct s64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_rz(__ll2float_rz(i)); // fallback: ll -> float -> bf16, both steps with _rz
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) // long long -> bf16 using the round-down (.rm / _rd) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct s64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_rd(__ll2float_rd(i)); // fallback: ll -> float -> bf16, both steps with _rd
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) // long long -> bf16 using the round-up (.rp / _ru) variants
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); // direct s64 -> bf16 conversion
+    return h;
+,
+    return __float2bfloat16_ru(__ll2float_ru(i)); // fallback: ll -> float -> bf16, both steps with _ru
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) // rounds h to an integral value toward zero; result stays in bf16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); // bf16 -> bf16 cvt with the .rzi integer-rounding modifier
+    return r;
+,
+    return __float2bfloat16_rz(truncf(__bfloat162float(h))); // fallback: widen to float, truncf, narrow back
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) // rounds h to an integral value toward +infinity; result stays in bf16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); // bf16 -> bf16 cvt with the .rpi integer-rounding modifier
+    return r;
+,
+    float fh = __bfloat162float(h); // fallback: widen to float and round that in place
+    asm( "{ cvt.rpi.f32.f32 %0, %0; }\n" // "+f" makes fh both the input and the output of the cvt
+        :"+f"(fh));
+    return __float2bfloat16_rz(fh);
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) // rounds h to an integral value toward -infinity; result stays in bf16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); // bf16 -> bf16 cvt with the .rmi integer-rounding modifier
+    return r;
+,
+    float fh = __bfloat162float(h); // fallback: widen to float and round that in place
+    asm( "{ cvt.rmi.f32.f32 %0, %0; }\n" // "+f" makes fh both the input and the output of the cvt
+        :"+f"(fh));
+    return __float2bfloat16_rz(fh);
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) // rounds h to the nearest integral value; result stays in bf16
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); // bf16 -> bf16 cvt with the .rni integer-rounding modifier
+    return r;
+,
+    return __float2bfloat16_rz(rintf(__bfloat162float(h))); // fallback: widen to float, rintf, narrow back
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) // applies htrunc to each of the two packed bf16 values
+{
+    const __nv_bfloat16 low  = htrunc(h.x);
+    const __nv_bfloat16 high = htrunc(h.y);
+    return __nv_bfloat162(low, high); // repack in the same order (.x result first, .y result second)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) // applies hceil to each of the two packed bf16 values
+{
+    const __nv_bfloat16 low  = hceil(h.x);
+    const __nv_bfloat16 high = hceil(h.y);
+    return __nv_bfloat162(low, high); // repack in the same order (.x result first, .y result second)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) // applies hfloor to each of the two packed bf16 values
+{
+    const __nv_bfloat16 low  = hfloor(h.x);
+    const __nv_bfloat16 high = hfloor(h.y);
+    return __nv_bfloat162(low, high); // repack in the same order (.x result first, .y result second)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) // applies hrint to each of the two packed bf16 values
+{
+    return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); // split into low/high, round each, repack in the same order
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) // builds a pair from the low halves of a and b: result = {a.x, b.x}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n" // unpack a into two 16-bit registers
+        "  mov.b32 {blow,bhigh}, %2;\n" // unpack b into two 16-bit registers
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.x; // host: plain member copies
+    val.y = b.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) // builds a pair from the high halves of a and b: result = {a.y, b.y}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n" // unpack a into two 16-bit registers
+        "  mov.b32 {blow,bhigh}, %2;\n" // unpack b into two 16-bit registers
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.y; // host: plain member copies
+    val.y = b.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) // extracts the low element (a.x) of a packed pair
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n" // unpack the 32-bit pair into two 16-bit registers
+        " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.x; // host: plain member copy
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) // classifies a by raw bit pattern: -1 for 0xFF80, 1 for 0x7F80, 0 for everything else
+{
+    int retval;
+    const __nv_bfloat16_raw araw = __nv_bfloat16_raw(a);
+    if (araw.x == 0xFF80U) { // sign bit set, exponent all ones, mantissa zero: negative infinity
+        retval = -1;
+    } else if (araw.x == 0x7F80U) { // sign bit clear, exponent all ones, mantissa zero: positive infinity
+        retval = 1;
+    } else {
+        retval = 0;
+    }
+    return retval;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) // duplicates the low element into both slots: result = {a.x, a.x}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit pair into two 16-bit registers
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.x; // host: plain member copies
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) // duplicates the high element into both slots: result = {a.y, a.y}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit pair into two 16-bit registers
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y; // host: plain member copies
+    val.y = a.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) // extracts the high element (a.y) of a packed pair
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n" // unpack the 32-bit pair into two 16-bit registers
+        " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.y; // host: plain member copy
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) // packs two bf16 values into a pair: result = {a, b}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%2};}\n" // pack the two 16-bit inputs into one 32-bit register
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b)));
+,
+    val.x = a; // host: plain member copies
+    val.y = b;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) // broadcasts one bf16 value into both slots: result = {a, a}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%1};}\n" // pack the same 16-bit input twice into one 32-bit register
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+,
+    val.x = a; // host: plain member copies
+    val.y = a;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) // swaps the two elements of a pair: result = {a.y, a.x}
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n" // unpack the 32-bit pair into two 16-bit registers
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y; // host: plain member copies
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) // returns the 16 storage bits of h as a short; no numeric conversion is performed
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return static_cast<short int>(__BFLOAT16_TO_CUS(h)); // device: read the bits through the __BFLOAT16_TO_CUS accessor macro
+,
+    return static_cast<short int>(__nv_bfloat16_raw(h).x); // host: read the bits through the raw struct
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) // returns the 16 storage bits of h as an unsigned short; no numeric conversion is performed
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __BFLOAT16_TO_CUS(h); // device: read the bits through the __BFLOAT16_TO_CUS accessor macro
+,
+    return __nv_bfloat16_raw(h).x; // host: read the bits through the raw struct
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) // builds a bf16 whose 16 storage bits are those of i; no numeric conversion is performed
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = static_cast<unsigned short int>(i); // device: write the bits through the __BFLOAT16_TO_US accessor macro
+    return h;
+,
+    __nv_bfloat16_raw hr; // host: fill the raw struct and construct the bf16 from it
+    hr.x = static_cast<unsigned short int>(i);
+    return __nv_bfloat16(hr);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) // builds a bf16 whose 16 storage bits are those of i; no numeric conversion is performed
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = i; // device: write the bits through the __BFLOAT16_TO_US accessor macro
+    return h;
+,
+    __nv_bfloat16_raw hr; // host: fill the raw struct and construct the bf16 from it
+    hr.x = i;
+    return __nv_bfloat16(hr);
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA)
+/******************************************************************************
+*                           __nv_bfloat16, __nv_bfloat162 warp shuffle        *
+******************************************************************************/
+#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name, var, delta, c, mask) /* do */ /* expands to a block that runs PTX instruction `name` on the 32-bit pair and returns from the enclosing function */ {\
+   __nv_bfloat162 r; /* receives the shuffled pair */ \
+   asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" /* instruction text is built by stringifying `name` */ \
+       :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; /* note: this is a return statement of the function that uses the macro */ \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width) // warp shuffle of a bf16 pair via shfl.sync.idx.b32, moving both elements as one 32-bit value
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // shfl `c` operand: (warp_size - width) placed at bit 8 and above, 0x1f in the low bits
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) // the macro body contains the return statement
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width) // warp shuffle of a bf16 pair via shfl.sync.up.b32, moving both elements as one 32-bit value
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; // shfl `c` operand: (warp_size - width) placed at bit 8 and above; unlike the other variants the low bits stay 0
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32, var, delta, c, mask) // the macro body contains the return statement
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width) // warp shuffle of a bf16 pair via shfl.sync.down.b32, moving both elements as one 32-bit value
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; // shfl `c` operand: (warp_size - width) placed at bit 8 and above, 0x1f in the low bits
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32, var, delta, c, mask) // the macro body contains the return statement
+}
+/*
+ * Butterfly warp shuffle of a packed __nv_bfloat162 using PTX
+ * shfl.sync.bfly.b32. The pair travels as a single 32-bit value. The `c`
+ * operand is built as ((WARP_SZ - width) << 8) | 0x1f and laneMask is passed
+ * as the lane operand.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width)
+{
+    unsigned int warp_size;
+    /* Read the warp size from the PTX special constant WARP_SZ. */
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    /* The macro declares the result, runs the instruction and returns it. */
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask)
+}
+
+#undef __SHUFFLE_SYNC_BFLOAT162_MACRO
+
+/*
+ * Scalar __nv_bfloat16 overload of __shfl_sync. The value is duplicated into
+ * both halves of a __nv_bfloat162, shuffled with the packed overload, and
+ * the low half of the shuffled pair is returned.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width)
+{
+    const __nv_bfloat162 paired = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_sync(mask, paired, srcLane, width));
+}
+/*
+ * Scalar __nv_bfloat16 overload of __shfl_up_sync. The value is duplicated
+ * into both halves of a __nv_bfloat162, shuffled with the packed overload,
+ * and the low half of the shuffled pair is returned.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 paired = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_up_sync(mask, paired, delta, width));
+}
+/*
+ * Scalar __nv_bfloat16 overload of __shfl_down_sync. The value is duplicated
+ * into both halves of a __nv_bfloat162, shuffled with the packed overload,
+ * and the low half of the shuffled pair is returned.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 paired = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_down_sync(mask, paired, delta, width));
+}
+/*
+ * Scalar __nv_bfloat16 overload of __shfl_xor_sync. The value is duplicated
+ * into both halves of a __nv_bfloat162, shuffled with the packed overload,
+ * and the low half of the shuffled pair is returned.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width)
+{
+    const __nv_bfloat162 paired = __halves2bfloat162(var, var);
+    return __low2bfloat16(__shfl_xor_sync(mask, paired, laneMask, width));
+}
+
+/******************************************************************************
+*        __nv_bfloat16 and __nv_bfloat162 global load/store intrinsics        *
+*    __ldg,__ldcg,__ldca,__ldcs,__ldlu,__ldcv,__stwb,__stcg,__stcs,__stwt     *
+******************************************************************************/
+
+/*
+ * Inline-asm constraint used for pointer operands in the load/store
+ * intrinsics below: "l" when building for Win64 MSVC, an LP64 target or
+ * NVRTC, and "r" otherwise.
+ */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+/* Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.nc.b32. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.nc.b16. */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.cg.b32. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.cg.b16. */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.ca.b32. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.ca.b16. */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.cs.b32. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/* Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.cs.b16. */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+/*
+ * Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.lu.b32.
+ * The asm statement carries a "memory" clobber.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+/*
+ * Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.lu.b16.
+ * The asm statement carries a "memory" clobber.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+/*
+ * Loads one __nv_bfloat162 (32 bits) from ptr with PTX ld.global.cv.b32.
+ * The asm statement carries a "memory" clobber.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+/*
+ * Loads one __nv_bfloat16 (16 bits) from ptr with PTX ld.global.cv.b16.
+ * The asm statement carries a "memory" clobber.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+
+/* Stores value (32 bits) to ptr with PTX st.global.wb.b32; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+/* Stores value (16 bits) to ptr with PTX st.global.wb.b16; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+/* Stores value (32 bits) to ptr with PTX st.global.cg.b32; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+/* Stores value (16 bits) to ptr with PTX st.global.cg.b16; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+/* Stores value (32 bits) to ptr with PTX st.global.cs.b32; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+/* Stores value (16 bits) to ptr with PTX st.global.cs.b16; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+/* Stores value (32 bits) to ptr with PTX st.global.wt.b32; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+/* Stores value (16 bits) to ptr with PTX st.global.wt.b16; the asm carries a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+
+#undef __LDG_PTR
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*                             __nv_bfloat162 comparison                       *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/*
+ * Body shared by the __internal_device_h*2 comparisons; it reads the
+ * enclosing function's parameters `a` and `b` and returns a __nv_bfloat162.
+ *  - NV_PROVIDES_SM_90 targets: a single `name`.bf16x2.bf16x2 instruction.
+ *  - Other targets: each 16-bit half is moved into the upper 16 bits of a
+ *    32-bit register (mask for the high half, shift left by 16 for the low
+ *    half), compared with `name`.f32.f32, and the two 32-bit results are
+ *    packed back: the high result keeps its position and the low result is
+ *    shifted right by 16 before the final OR.
+ */
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+,\
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  shr.u32 low_res, low_res, 16;\n"\
+        "  or.b32  %0, high_res, low_res;}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+)\
+   return val; \
+}
+/* Device-side per-half comparison of a and b using PTX `set.eq`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+/* Device-side per-half comparison of a and b using PTX `set.ne`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+/* Device-side per-half comparison of a and b using PTX `set.le`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+/* Device-side per-half comparison of a and b using PTX `set.ge`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+/* Device-side per-half comparison of a and b using PTX `set.lt`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+/* Device-side per-half comparison of a and b using PTX `set.gt`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+/* Device-side per-half comparison of a and b using PTX `set.equ`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+/* Device-side per-half comparison of a and b using PTX `set.neu`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+/* Device-side per-half comparison of a and b using PTX `set.leu`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+}
+/* Device-side per-half comparison of a and b using PTX `set.geu`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+}
+/* Device-side per-half comparison of a and b using PTX `set.ltu`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+}
+/* Device-side per-half comparison of a and b using PTX `set.gtu`; the body is __COMPARISON_OP_BFLOAT162_MACRO. */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/*
+ * Per-half __heq comparison of a and b. Device code forwards to
+ * __internal_device_heq2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_heq2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__heq(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__heq(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hne comparison of a and b. Device code forwards to
+ * __internal_device_hne2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hne2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hne(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hne(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hle comparison of a and b. Device code forwards to
+ * __internal_device_hle2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hle2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hle(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hle(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hge comparison of a and b. Device code forwards to
+ * __internal_device_hge2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hge2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hge(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hge(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hlt comparison of a and b. Device code forwards to
+ * __internal_device_hlt2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hlt2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hlt(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hlt(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hgt comparison of a and b. Device code forwards to
+ * __internal_device_hgt2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgt2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hgt(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hgt(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hequ comparison of a and b. Device code forwards to
+ * __internal_device_hequ2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hequ2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hequ(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hequ(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hneu comparison of a and b. Device code forwards to
+ * __internal_device_hneu2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hneu2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hneu(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hneu(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hleu comparison of a and b. Device code forwards to
+ * __internal_device_hleu2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hleu2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hleu(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hleu(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hgeu comparison of a and b. Device code forwards to
+ * __internal_device_hgeu2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgeu2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hgeu(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hgeu(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hltu comparison of a and b. Device code forwards to
+ * __internal_device_hltu2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hltu2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hltu(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hltu(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+/*
+ * Per-half __hgtu comparison of a and b. Device code forwards to
+ * __internal_device_hgtu2. Host code writes 0x3F80 (bfloat16 1.0) into each
+ * half whose scalar comparison is true and 0 into the others.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hgtu2(a, b);
+,
+    __nv_bfloat162_raw packed;
+    packed.x = (unsigned short)0U;
+    packed.y = (unsigned short)0U;
+    if (__hgtu(a.x, b.x)) { packed.x = (unsigned short)0x3F80U; }
+    if (__hgtu(a.y, b.y)) { packed.y = (unsigned short)0x3F80U; }
+    return __nv_bfloat162(packed);
+)
+}
+
+/******************************************************************************
+*                __nv_bfloat162 comparison with mask output                   *
+******************************************************************************/
+/*
+ * Body shared by the NV_PROVIDES_SM_90 branch of the __h*2_mask functions.
+ * Issues the PTX instruction `name`.u32.bf16x2 on the 32-bit views of the
+ * enclosing function's parameters `a` and `b` and returns the unsigned
+ * 32-bit result.
+ */
+#define __COMPARISON_OP_BFLOAT162_MACRO_MASK(name) {\
+   unsigned val; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".u32.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+}
+
+/*
+ * Per-half __heq comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.eq)
+,
+    const unsigned int lo_bits = __heq(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __heq(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hne comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ne)
+,
+    const unsigned int lo_bits = __hne(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hne(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hle comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.le)
+,
+    const unsigned int lo_bits = __hle(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hle(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hge comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ge)
+,
+    const unsigned int lo_bits = __hge(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hge(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hlt comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.lt)
+,
+    const unsigned int lo_bits = __hlt(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hlt(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hgt comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gt)
+,
+    const unsigned int lo_bits = __hgt(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hgt(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hequ comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.equ)
+,
+    const unsigned int lo_bits = __hequ(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hequ(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hneu comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.neu)
+,
+    const unsigned int lo_bits = __hneu(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hneu(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hleu comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.leu)
+,
+    const unsigned int lo_bits = __hleu(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hleu(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hgeu comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.geu)
+,
+    const unsigned int lo_bits = __hgeu(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hgeu(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hltu comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ltu)
+,
+    const unsigned int lo_bits = __hltu(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hltu(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+/*
+ * Per-half __hgtu comparison of a and b returned as a bit mask.
+ * NV_PROVIDES_SM_90 targets use the PTX mask macro. The fallback sets bits
+ * 0-15 when the .x halves satisfy the test and bits 16-31 when the .y
+ * halves do.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gtu)
+,
+    const unsigned int lo_bits = __hgtu(a.x, b.x) ? 0x0000FFFFU : 0U;
+    const unsigned int hi_bits = __hgtu(a.y, b.y) ? 0xFFFF0000U : 0U;
+    return hi_bits | lo_bits;
+)
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO_MASK
+
+/*
+ * Body shared by the NV_PROVIDES_SM_90 branch of the __hb*2 functions.
+ * Issues `name`.bf16x2.bf16x2 on the enclosing function's parameters `a` and
+ * `b` and returns true only when the 32-bit result equals 0x3F803F80, i.e.
+ * when both 16-bit halves of the result are 0x3F80.
+ */
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   unsigned int val; \
+   bool retval; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   if (val == 0x3F803F80U) {\
+      retval = true; \
+   } else { \
+      retval = false; \
+   }\
+   return retval;\
+}
+
+/*
+ * True when __heq holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+,
+    if (!__heq(a.x, b.x)) { return false; }
+    return __heq(a.y, b.y);
+)
+}
+/*
+ * True when __hne holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+,
+    if (!__hne(a.x, b.x)) { return false; }
+    return __hne(a.y, b.y);
+)
+}
+/*
+ * True when __hle holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le)
+,
+    if (!__hle(a.x, b.x)) { return false; }
+    return __hle(a.y, b.y);
+)
+}
+/*
+ * True when __hge holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+,
+    if (!__hge(a.x, b.x)) { return false; }
+    return __hge(a.y, b.y);
+)
+}
+/*
+ * True when __hlt holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+,
+    if (!__hlt(a.x, b.x)) { return false; }
+    return __hlt(a.y, b.y);
+)
+}
+/*
+ * True when __hgt holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+,
+    if (!__hgt(a.x, b.x)) { return false; }
+    return __hgt(a.y, b.y);
+)
+}
+/*
+ * True when __hequ holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+,
+    if (!__hequ(a.x, b.x)) { return false; }
+    return __hequ(a.y, b.y);
+)
+}
+/*
+ * True when __hneu holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+,
+    if (!__hneu(a.x, b.x)) { return false; }
+    return __hneu(a.y, b.y);
+)
+}
+/*
+ * True when __hleu holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+,
+    if (!__hleu(a.x, b.x)) { return false; }
+    return __hleu(a.y, b.y);
+)
+}
+/*
+ * True when __hgeu holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+,
+    if (!__hgeu(a.x, b.x)) { return false; }
+    return __hgeu(a.y, b.y);
+)
+}
+/*
+ * True when __hltu holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+,
+    if (!__hltu(a.x, b.x)) { return false; }
+    return __hltu(a.y, b.y);
+)
+}
+/*
+ * True when __hgtu holds for both halves of a and b. NV_PROVIDES_SM_90
+ * targets use the PTX macro. The fallback tests the .x halves first and
+ * only evaluates the .y halves when that test passes.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+,
+    if (!__hgtu(a.x, b.x)) { return false; }
+    return __hgtu(a.y, b.y);
+)
+}
+#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO
+/******************************************************************************
+*                             __nv_bfloat16 comparison                        *
+******************************************************************************/
+/*
+ * Device body shared by the scalar __nv_bfloat16 comparisons; it reads the
+ * enclosing function's parameters `a` and `b` and returns a bool.
+ *  - NV_PROVIDES_SM_90 targets: setp.`name`.bf16 into a predicate, then
+ *    selp.u16 turns the predicate into 1 or 0.
+ *  - Other targets: each 16-bit operand is placed in the upper half of a
+ *    32-bit register (lower half zero) and compared with set.`name`.f32.f32.
+ * Both paths return true when the instruction result is non-zero.
+ */
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\
+   unsigned short val; \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_BF16_STRINGIFY(name) ".bf16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+,\
+   unsigned int val; \
+   asm( "{.reg .b32 a,b;\n"\
+        "  mov.b32 a, {0, %1};\n"\
+        "  mov.b32 b, {0, %2};\n"\
+        "  set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\
+        :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+)\
+}
+/*
+ * a == b for two __nv_bfloat16 values. Device code uses the PTX `eq`
+ * comparison macro; host code compares the operands after converting them
+ * to float.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(eq)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs == rhs);
+    return holds;
+)
+}
+/*
+ * a != b for two __nv_bfloat16 values, false when either operand is NaN.
+ * Device code uses the PTX `ne` comparison macro; host code compares the
+ * operands as floats and then rejects NaN inputs explicitly.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ne)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs != rhs) {
+        return !(__hisnan(a) || __hisnan(b));
+    }
+    return false;
+)
+}
+/*
+ * a <= b for two __nv_bfloat16 values. Device code uses the PTX `le`
+ * comparison macro; host code compares the operands after converting them
+ * to float.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(le)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs <= rhs);
+    return holds;
+)
+}
+/*
+ * a >= b for two __nv_bfloat16 values. Device code uses the PTX `ge`
+ * comparison macro; host code compares the operands after converting them
+ * to float.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ge)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs >= rhs);
+    return holds;
+)
+}
+/*
+ * a < b for two __nv_bfloat16 values. Device code uses the PTX `lt`
+ * comparison macro; host code compares the operands after converting them
+ * to float.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(lt)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs < rhs);
+    return holds;
+)
+}
+/*
+ * a > b for two __nv_bfloat16 values. Device code uses the PTX `gt`
+ * comparison macro; host code compares the operands after converting them
+ * to float.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(gt)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs > rhs);
+    return holds;
+)
+}
+/*
+ * Unordered a == b for two __nv_bfloat16 values: also true when either
+ * operand is NaN. Device code uses the PTX `equ` comparison macro; host
+ * code compares as floats and falls back to explicit NaN checks.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(equ)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs == rhs) { return true; }
+    return (__hisnan(a)) || (__hisnan(b));
+)
+}
+/*
+ * Unordered a != b for two __nv_bfloat16 values. Device code uses the PTX
+ * `neu` comparison macro; host code returns the float `!=` result of the
+ * converted operands.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(neu)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    const bool holds = (lhs != rhs);
+    return holds;
+)
+}
+/*
+ * Unordered a <= b for two __nv_bfloat16 values: also true when either
+ * operand is NaN. Device code uses the PTX `leu` comparison macro; host
+ * code compares as floats and falls back to explicit NaN checks.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(leu)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs <= rhs) { return true; }
+    return (__hisnan(a)) || (__hisnan(b));
+)
+}
+/*
+ * Unordered a >= b for two __nv_bfloat16 values: also true when either
+ * operand is NaN. Device code uses the PTX `geu` comparison macro; host
+ * code compares as floats and falls back to explicit NaN checks.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(geu)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs >= rhs) { return true; }
+    return (__hisnan(a)) || (__hisnan(b));
+)
+}
+/*
+ * Unordered a < b for two __nv_bfloat16 values: also true when either
+ * operand is NaN. Device code uses the PTX `ltu` comparison macro; host
+ * code compares as floats and falls back to explicit NaN checks.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(ltu)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs < rhs) { return true; }
+    return (__hisnan(a)) || (__hisnan(b));
+)
+}
+/*
+ * Unordered a > b for two __nv_bfloat16 values: also true when either
+ * operand is NaN. Device code uses the PTX `gtu` comparison macro; host
+ * code compares as floats and falls back to explicit NaN checks.
+ */
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __COMPARISON_OP_BFLOAT16_MACRO(gtu)
+,
+    const float lhs = __bfloat162float(a);
+    const float rhs = __bfloat162float(b);
+    if (lhs > rhs) { return true; }
+    return (__hisnan(a)) || (__hisnan(b));
+)
+}
+#undef __COMPARISON_OP_BFLOAT16_MACRO
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                        *
+******************************************************************************/
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+/*
+ * Device-side packed addition a + b on both halves.
+ * NV_PROVIDES_SM_90 targets use add.bf16x2. Other targets compute
+ * a * 1.0 + b with fma.rn.bf16x2 (0x3f803f80 holds bfloat16 1.0 in both
+ * halves).
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+
+/*
+ * Device-side packed subtraction a - b on both halves.
+ * NV_PROVIDES_SM_90 targets use sub.bf16x2. Other targets compute
+ * b * (-1.0) + a with fma.rn.bf16x2 (0xbf80bf80 holds bfloat16 -1.0 in both
+ * halves).
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+/*
+ * Device-side packed multiplication a * b on both halves.
+ * NV_PROVIDES_SM_90 targets use mul.bf16x2. Other targets compute
+ * a * b + (-0.0) with fma.rn.bf16x2 (0x80008000 holds bfloat16 -0.0 in both
+ * halves).
+ * NOTE(review): -0.0 rather than +0.0 is presumably chosen so that a
+ * negative-zero product keeps its sign; confirm.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ mul.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n"
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+/*
+ * Device-side packed addition a + b with an explicit .rn rounding modifier.
+ * NV_PROVIDES_SM_90 targets use add.rn.bf16x2. Other targets compute
+ * a * 1.0 + b with fma.rn.bf16x2 (0x3f803f80 holds bfloat16 1.0 in both
+ * halves), the same fallback sequence as __internal_device_hadd2.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ add.rn.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+/*
+ * Device-side packed subtraction a - b with an explicit .rn rounding
+ * modifier. NV_PROVIDES_SM_90 targets use sub.rn.bf16x2. Other targets
+ * compute b * (-1.0) + a with fma.rn.bf16x2 (0xbf80bf80 holds bfloat16 -1.0
+ * in both halves), the same fallback sequence as __internal_device_hsub2.
+ */
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+   asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+,
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) // Device helper: packed bf16x2 multiplication using the explicit .rn instruction form.
+{
+   __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single mul.rn.bf16x2 instruction
+   asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+, // otherwise: multiply through a fused multiply-add with a -0.0 addend
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n" // c = bf16 -0.0 in both lanes
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n" // val = a * b + (-0.0); the -0.0 addend keeps the sign of a zero product
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+)
+   return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 addition: returns {a.x + b.x, a.y + b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed device helper
+    val = __internal_device_hadd2(a, b);
+, // all other targets: add each half separately with the scalar __hadd
+    val.x = __hadd(a.x, b.x);
+    val.y = __hadd(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 subtraction: returns {a.x - b.x, a.y - b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed device helper
+    val = __internal_device_hsub2(a, b);
+, // all other targets: subtract each half separately with the scalar __hsub
+    val.x = __hsub(a.x, b.x);
+    val.y = __hsub(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 multiplication: returns {a.x * b.x, a.y * b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed device helper
+    val = __internal_device_hmul2(a, b);
+, // all other targets: multiply each half separately with the scalar __hmul
+    val.x = __hmul(a.x, b.x);
+    val.y = __hmul(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 addition, explicit .rn variant: returns {a.x + b.x, a.y + b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed .rn device helper
+    val = __internal_device_hadd2_rn(a, b);
+, // all other targets: add each half separately with the scalar __hadd_rn
+    val.x = __hadd_rn(a.x, b.x);
+    val.y = __hadd_rn(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 subtraction, explicit .rn variant: returns {a.x - b.x, a.y - b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed .rn device helper
+    val = __internal_device_hsub2_rn(a, b);
+, // all other targets: subtract each half separately with the scalar __hsub_rn
+    val.x = __hsub_rn(a.x, b.x);
+    val.y = __hsub_rn(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 multiplication, explicit .rn variant: returns {a.x * b.x, a.y * b.y}.
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: packed .rn device helper
+    val = __internal_device_hmul2_rn(a, b);
+, // all other targets: multiply each half separately with the scalar __hmul_rn
+    val.x = __hmul_rn(a.x, b.x);
+    val.y = __hmul_rn(a.y, b.y);
+)
+    return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 addition with each lane clamped to [0.0, 1.0].
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: add and clamp in one PTX block
+    asm( "{.reg .b32 f, one, zero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes; used as multiplier and as upper bound
+        "  mov.b32 zero, 0;\n"
+        "  fma.rn.bf16x2 f,%1,one,%2;\n" // f = a * 1.0 + b
+        "  max.bf16x2 f, f, zero;\n" // lower clamp at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // upper clamp at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+, // all other targets: saturate each half with the scalar __hadd_sat
+    val.x = __hadd_sat(a.x, b.x);
+    val.y = __hadd_sat(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 subtraction (a - b) with each lane clamped to [0.0, 1.0].
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: subtract and clamp in one PTX block
+    asm( "{.reg .b32 f, one, zero, mone;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes (upper bound)
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mone, 0xbf80bf80U;\n" // bf16 -1.0 in both lanes
+        "  fma.rn.bf16x2 f,%2,mone,%1;\n" // f = b * (-1.0) + a
+        "  max.bf16x2 f, f, zero;\n" // lower clamp at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // upper clamp at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+, // all other targets: saturate each half with the scalar __hsub_sat
+    val.x = __hsub_sat(a.x, b.x);
+    val.y = __hsub_sat(a.y, b.y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) // Packed bf16x2 multiplication with each lane clamped to [0.0, 1.0].
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: multiply and clamp in one PTX block
+    asm( "{.reg .b32 f, one, zero, mzero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes (upper bound)
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mzero, 0x80008000U;\n" // bf16 -0.0 in both lanes
+        "  fma.rn.bf16x2 f,%1,%2,mzero;\n" // f = a * b + (-0.0)
+        "  max.bf16x2 f, f, zero;\n" // lower clamp at 0.0
+        "  min.bf16x2 %0, f, one;\n}" // upper clamp at 1.0
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+, // all other targets: saturate each half with the scalar __hmul_sat
+    val.x = __hmul_sat(a.x, b.x);
+    val.y = __hmul_sat(a.y, b.y);
+)
+    return val;
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) // Packed bf16x2 fused multiply-add: returns a * b + c per lane.
+{
+    __nv_bfloat162 val;
+    asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" // single fma.rn.bf16x2 instruction over both lanes
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) // Packed bf16x2 fused multiply-add a * b + c with each lane clamped to [0.0, 1.0].
+{
+    __nv_bfloat162 val;
+    asm( "{ .reg .b32 f, one, zero;\n"
+         "  mov.b32 one, 0x3f803f80U;\n" // bf16 1.0 in both lanes (upper bound)
+         "  mov.b32 zero, 0;\n"
+         "  fma.rn.bf16x2 f, %1, %2, %3;\n" // f = a * b + c
+         "  max.bf16x2 f, f, zero;\n" // lower clamp at 0.0
+         "  min.bf16x2 %0, f, one;\n}" // upper clamp at 1.0
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) {
+    // Packed bf16x2 division: each half of a is divided by the matching half
+    // of b with the scalar __hdiv, and the two quotients are packed back
+    // together (low half first).
+    // Low halves.
+    const __nv_bfloat16 num_lo = __low2bfloat16(a);
+    const __nv_bfloat16 den_lo = __low2bfloat16(b);
+    const __nv_bfloat16 quot_lo = __hdiv(num_lo, den_lo);
+
+    // High halves.
+    const __nv_bfloat16 num_hi = __high2bfloat16(a);
+    const __nv_bfloat16 den_hi = __high2bfloat16(b);
+    const __nv_bfloat16 quot_hi = __hdiv(num_hi, den_hi);
+    return __halves2bfloat162(quot_lo, quot_hi);
+}
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                        *
+******************************************************************************/
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 addition with bf16 PTX instructions, returns a + b.
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single add.bf16 instruction
+   asm( "{ add.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: add through a fused multiply-add with multiplier 1.0
+    asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n" // c = bf16 1.0
+        "  fma.rn.bf16 %0,%1,c,%2;}\n" // val = a * 1.0 + b
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 subtraction with bf16 PTX instructions, returns a - b.
+{
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single sub.bf16 instruction
+   asm( "{ sub.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: subtract through a fused multiply-add with multiplier -1.0
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n" // c = bf16 -1.0
+        "  fma.rn.bf16 %0,%2,c,%1;}\n" // val = b * (-1.0) + a
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 multiplication with bf16 PTX instructions, returns a * b.
+{
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single mul.bf16 instruction
+   asm( "{ mul.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: multiply through a fused multiply-add with a -0.0 addend
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n" // c = bf16 -0.0
+        "  fma.rn.bf16 %0,%1,%2,c;}\n" // val = a * b + (-0.0); the -0.0 addend keeps the sign of a zero product
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 addition using the explicit .rn instruction form.
+{
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single add.rn.bf16 instruction
+   asm( "{ add.rn.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: add through a fused multiply-add with multiplier 1.0
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n" // c = bf16 1.0
+        "  fma.rn.bf16 %0,%1,c,%2;}\n" // val = a * 1.0 + b
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 subtraction using the explicit .rn instruction form.
+{
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single sub.rn.bf16 instruction
+   asm( "{ sub.rn.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: subtract through a fused multiply-add with multiplier -1.0
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n" // c = bf16 -1.0
+        "  fma.rn.bf16 %0,%2,c,%1;}\n" // val = b * (-1.0) + a
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device helper: scalar bf16 multiplication using the explicit .rn instruction form.
+{
+   __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single mul.rn.bf16 instruction
+   asm( "{ mul.rn.bf16 %0,%1,%2; }\n"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // otherwise: multiply through a fused multiply-add with a -0.0 addend
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n" // c = bf16 -0.0
+        "  fma.rn.bf16 %0,%1,%2,c;}\n" // val = a * b + (-0.0); the -0.0 addend keeps the sign of a zero product
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+)
+   return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device-side scalar bf16 addition, returns a + b.
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: bf16 instructions via the sm80 helper
+    val = __internal_sm80_device_hadd(a, b);
+, // other targets: widen to float, add there, convert the sum back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fa, 1.0f, fb)); // fa * 1.0f + fb
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device-side scalar bf16 subtraction, returns a - b.
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: bf16 instructions via the sm80 helper
+    val = __internal_sm80_device_hsub(a, b);
+, // other targets: widen to float, subtract there, convert the difference back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fb, -1.0f, fa)); // fb * (-1.0f) + fa
+)
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) // Device-side scalar bf16 multiplication, returns a * b.
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: bf16 instructions via the sm80 helper
+    val = __internal_sm80_device_hmul(a, b);
+, // other targets: widen to float, multiply there, convert the product back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    // avoid ftz in device code
+    val = __float2bfloat16(__fmaf_ieee_rn(fa, fb, -0.0f)); // fa * fb + (-0.0f)
+)
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 addition, returns a + b.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation: use the device helper
+    return __internal_device_hadd(a, b);
+, // otherwise: add in float and convert the sum back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 subtraction, returns a - b.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation: use the device helper
+    return __internal_device_hsub(a, b);
+, // otherwise: subtract in float and convert the difference back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 multiplication, returns a * b.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation: use the device helper
+    return __internal_device_hmul(a, b);
+, // otherwise: multiply in float and convert the product back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa * fb);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 addition, explicit .rn variant; returns a + b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: dedicated .rn helper
+    return __internal_sm80_device_hadd_rn(a, b);
+, // other targets: same code path as plain __hadd
+    return __hadd(a, b);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 subtraction, explicit .rn variant; returns a - b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: dedicated .rn helper
+    return __internal_sm80_device_hsub_rn(a, b);
+, // other targets: same code path as plain __hsub
+    return __hsub(a, b);
+
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 multiplication, explicit .rn variant; returns a * b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: dedicated .rn helper
+    return __internal_sm80_device_hmul_rn(a, b);
+, // other targets: same code path as plain __hmul
+    return __hmul(a, b);
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 addition with the result clamped to [0.0, 1.0].
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: add and clamp in one PTX block
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 1.0; used as multiplier and as upper bound
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, one, %2;\n" // f = a * 1.0 + b
+         "  max.bf16 f, f, zero;\n" // lower clamp at 0.0
+         "  min.bf16 %0, f, one;\n}" // upper clamp at 1.0
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // other targets: same clamp built from __hadd, __hmax and __hmin
+    val = __hmin(__hmax(__hadd(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 subtraction (a - b) with the result clamped to [0.0, 1.0].
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: subtract and clamp in one PTX block
+    asm( "{ .reg .b16 f, one, zero, mone;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 1.0 (upper bound)
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mone, 0xbf80U;\n" // bf16 -1.0
+         "  fma.rn.bf16 f, %2, mone, %1;\n" // f = b * (-1.0) + a
+         "  max.bf16 f, f, zero;\n" // lower clamp at 0.0
+         "  min.bf16 %0, f, one;\n}" // upper clamp at 1.0
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // other targets: same clamp built from __hsub, __hmax and __hmin
+    val = __hmin(__hmax(__hsub(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) // Scalar bf16 multiplication with the result clamped to [0.0, 1.0].
+{
+    __nv_bfloat16 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: multiply and clamp in one PTX block
+    asm( "{ .reg .b16 f, one, zero, mzero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 1.0 (upper bound)
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mzero, 0x8000U;\n" // bf16 -0.0
+         "  fma.rn.bf16 f, %1, %2, mzero;\n" // f = a * b + (-0.0)
+         "  max.bf16 f, f, zero;\n" // lower clamp at 0.0
+         "  min.bf16 %0, f, one;\n}" // upper clamp at 1.0
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+, // other targets: same clamp built from __hmul, __hmax and __hmin
+    val = __hmin(__hmax(__hmul(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16);
+)
+    return val;
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) // Scalar bf16 fused multiply-add: returns a * b + c.
+{
+    __nv_bfloat16 val;
+    asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" // single fma.rn.bf16 instruction
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) // Scalar bf16 fused multiply-add a * b + c with the result clamped to [0.0, 1.0].
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n" // bf16 1.0 (upper bound)
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, %2, %3;\n" // f = a * b + c
+         "  max.bf16 f, f, zero;\n" // lower clamp at 0.0
+         "  min.bf16 %0, f, one;\n}" // upper clamp at 1.0
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __BINARY_OP_BFLOAT16_MACRO(name) /* function-body template: widen bf16 a and b to f32, apply <name>.f32, round the result back to bf16 */ /* do */ {\
+   __nv_bfloat16 val; \
+   asm( "{.reg .b32 a,b,res;\n"\
+        "  mov.b32 a, {0,%1};\n" /* bf16 bits of a go into the upper half of a 32-bit register */ \
+        "  mov.b32 b, {0,%2};\n" /* same widening for b */ \
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\
+        "  cvt.rn.bf16.f32 %0, res;}\n"\
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val; \
+} /* while(0) */ /* NOTE(review): no use of this macro is visible between this definition and its #undef */
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { // Device-side scalar bf16 division a / b, computed in float with div.approx.f32.
+    const float two_126 =  __uint_as_float(0x7E800000U) ; //2^126
+    const float a_f = __bfloat162float(a);
+    float b_f = __bfloat162float(b);
+    float ans;
+    bool b_big = (fabsf(b_f) >= two_126);
+    if(b_big){b_f *= 0.25f;} // pre-scale very large divisors; NOTE(review): presumably keeps b inside the range div.approx.f32 handles accurately - confirm against the PTX ISA
+
+    // f32 div approximation. Good enough for c-r bfloat div.
+    asm("{ div.approx.f32 %0, %1, %2; }" : "=f"(ans) : "f"(a_f), "f"(b_f));
+
+    // Prevent ftz:
+    if(b_big){ans = __fmaf_ieee_rn(ans, 0.25f, -0.0f);} // undo the divisor pre-scaling: (a / (b/4)) * 0.25 == a / b
+    return __float2bfloat16(ans);
+}
+
+#undef __BINARY_OP_BFLOAT16_MACRO
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { // Scalar bf16 division, returns a / b.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation: use the device helper
+    return __internal_device_hdiv(a, b);
+, // otherwise: divide in float and convert the quotient back to bf16
+    const float fa = __bfloat162float(a);
+    const float fb = __bfloat162float(b);
+    return __float2bfloat16(fa / fb);
+)
+}
+
+/******************************************************************************
+*                             __nv_bfloat162 functions                        *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { // Sine of a bf16 value via float sinf(); under FTZ the widened input is passed through instead of the sinf() result when it compares equal to zero.
+    float f = __bfloat162float(a);
+    float r = sinf(f);
+    // Detect compile-time FTZ setting:
+    // if subnormal constant is not flushed to zero at compile-time, then
+    // ftz=off, and it is safe to return result of sinf()
+    // Otherwise, ftz=on, then sinf() result is valid for non-flushed
+    // values, and subnormal input is returned unchanged via else
+    // branch.
+    if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) // 0x00000001 is the smallest positive subnormal float
+    {
+        f = r;
+    }
+    return __float2bfloat16_rn(f); // f is either sinf(input) or the untouched input
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { // Sine of a bf16 value; forwards to __hsin_internal.
+    return __hsin_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { // Sine of each half of a, re-packed low half first.
+    const __nv_bfloat16 sin_lo = __hsin_internal(__low2bfloat16(a));
+    const __nv_bfloat16 sin_hi = __hsin_internal(__high2bfloat16(a));
+    return __halves2bfloat162(sin_lo, sin_hi);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) {
+    // Cosine of a bf16 value: evaluate cosf() on the float-widened input, then convert back with __float2bfloat16_rn.
+    const float cosine = cosf(__bfloat162float(a));
+    return __float2bfloat16_rn(cosine);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { // Cosine of a bf16 value; forwards to __hcos_internal.
+    return __hcos_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { // Cosine of each half of a, re-packed low half first.
+    const __nv_bfloat16 cos_lo = __hcos_internal(__low2bfloat16(a));
+    const __nv_bfloat16 cos_hi = __hcos_internal(__high2bfloat16(a));
+    return __halves2bfloat162(cos_lo, cos_hi);
+}
+
+__CUDA_BF16_DECL__ float __internal_device_fast_bf16exp(const float x) // Approximate exponential in float: scales x by the log2e_up constant and applies ex2.approx.f32.
+{
+    const float log2e_up = __uint_as_float(0x3FB8AA3CU); // approx. 1.4427; NOTE(review): the name suggests log2(e) rounded up - confirm
+    float fa = x * log2e_up;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2**fa (approximate), computed in place
+    return fa;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) {
+    // Exponential of a bf16 value: evaluated in float by __internal_device_fast_bf16exp, then converted back to bf16.
+    const float exp_f = __internal_device_fast_bf16exp(__bfloat162float(a));
+    return __float2bfloat16_rn(exp_f);
+}
+
+#define __APPROX_FCAST2(fun) /* function-body template for packed unary ops: applies <fun>.approx.f32 to both halves of a and returns the re-packed bf16x2 */ /* do */ {\
+   __nv_bfloat162 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n" /* split a into its low and high 16-bit halves */ \
+                "  mov.b32         fl, {0,hl};     \n" /* widen: bf16 bits go into the upper half of a 32-bit register */ \
+                "  mov.b32         fu, {0,hu};     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fl, fl;     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fu, fu;     \n"\
+                "  cvt.rn.bf16.f32    hl, fl;     \n" /* convert each f32 result back to bf16 */ \
+                "  cvt.rn.bf16.f32    hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n" /* re-pack, low half first */ \
+                "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+#define __BF16_SPEC_CASE2(i,r, spc, ulp) /* PTX fragment: r = p * ulp + r, where p is the set.eq comparison of i against the constant spc */ \
+   "{.reg.b32 spc, ulp, p;\n"\
+   "  mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n" /* NOTE: the comparison is issued with f16x2 types on the packed 16-bit lanes */ \
+   "  fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { // Approximate exponential of each half of a packed bf16x2 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: whole computation in one PTX block
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // split a into its low and high 16-bit halves
+        "  mov.b32         h, %1;          \n" // h is not read again inside this block
+        "  mov.b32         fl, {0,hl};     \n" // widen: bf16 bits go into the upper half of a 32-bit register
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x3FB8AA3CU;  \n" // same scale constant as log2e_up in __internal_device_fast_bf16exp
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n" // 2**(x * C), approximate
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // re-pack, low half first
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // other targets: run the scalar float helper on each half and re-pack
+    return __floats2bfloat162_rn( __internal_device_fast_bf16exp(__low2float(a)), __internal_device_fast_bf16exp(__high2float(a)) );
+)
+}
+
+__CUDA_BF16_DECL__ float __internal_device_tanhf_noftz(const float x) // tanhf(x), except that under FTZ an input comparing equal to zero is returned unchanged.
+{
+    float f = x;
+    float r = tanhf(x);
+    // Detect compile-time FTZ setting:
+    // if subnormal constant is not flushed to zero at compile-time, then
+    // ftz=off, and it is safe to return result of tanhf()
+    // Otherwise, ftz=on, then tanhf() result is valid for non-flushed
+    // values, and subnormal input is returned unchanged via else
+    // branch.
+    if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) // 0x00000001 is the smallest positive subnormal float
+    {
+        f = r;
+    }
+    return f;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a) { // Hyperbolic tangent of a bf16 value, evaluated in float and converted back to bf16.
+    float f = __bfloat162float(a);
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, // targets providing SM_75: tanh.approx.f32 instruction, in place
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f));
+, // other targets: tanhf() with the FTZ guard of __internal_device_tanhf_noftz
+    f = __internal_device_tanhf_noftz(f);
+)
+    __nv_bfloat16 h = __float2bfloat16_rn(f);
+    return h;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a) { // Hyperbolic tangent of each half of a packed bf16x2 value, evaluated in float.
+    float2 f = __bfloat1622float2(a);
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, // targets providing SM_75: tanh.approx.f32 on each component, in place
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.x));
+    asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.y));
+, // other targets: tanhf() with the FTZ guard of __internal_device_tanhf_noftz
+    f.x = __internal_device_tanhf_noftz(f.x);
+    f.y = __internal_device_tanhf_noftz(f.y);
+)
+    __nv_bfloat162 h = __float22bfloat162_rn(f);
+    return h;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a) { // Approximate hyperbolic tangent of a bf16 value.
+    __nv_bfloat16 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: tanh.approx.bf16 directly on the raw bf16 bits
+    __nv_bfloat16_raw hr = (__nv_bfloat16_raw)a;
+    asm("tanh.approx.bf16 %0, %0;" : "+h"(hr.x)); // in-place on the 16-bit payload
+    r = (__nv_bfloat16)hr;
+, // other targets: fall back to htanh
+    r = htanh(a);
+)
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a) { // Approximate hyperbolic tangent of each half of a packed bf16x2 value.
+    __nv_bfloat162 res;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: single tanh.approx.bf16x2 instruction
+    asm("tanh.approx.bf16x2 %0, %1;" : "=r"(__BFLOAT162_TO_UI(res)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // other targets: fall back to h2tanh
+    res = h2tanh(a);
+)
+    return res;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { // Approximate 2**a for a bf16 value, evaluated in float.
+    float fa = __bfloat162float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2**fa (approximate), in place
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { // Approximate 2**x for each half of a packed bf16x2 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: the macro expands to a complete body, including the return
+    __APPROX_FCAST2(ex2)
+, // other targets: ex2.approx.f32 on each half in float, then re-pack
+    float fl = __low2float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { // Approximate 10**a for a bf16 value, evaluated in float as 2**(a * log2(10)).
+    const float log10_2 = __uint_as_float(0x40549A78U); // NOTE: despite the name, this bit pattern is approx. 3.3219, i.e. log2(10)
+    float fa = __bfloat162float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); // fa = 2**fa (approximate), in place
+    __nv_bfloat16 r = __float2bfloat16_rn(fa);
+    __nv_bfloat16_raw araw = static_cast<__nv_bfloat16_raw>(a);
+    if (araw.x == (unsigned short)0xBC95U) // special case: one specific input bit pattern gets a hard-coded result
+    {
+        araw.x = 0x3f75U; // araw is reused as scratch storage for the fixed result bits
+        r = static_cast<__nv_bfloat16>(araw);
+    }
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { // Approximate 10**x for each half of a packed bf16x2 value.
+    __nv_bfloat162 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: whole computation in one PTX block
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n" // h is declared but not used inside this block
+        "  mov.b32         {hl, hu}, %1;   \n" // split a into its low and high 16-bit halves
+        "  mov.b32         fl, {0,hl};     \n" // widen: bf16 bits go into the upper half of a 32-bit register
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x40549A78U;  \n" // approx. 3.3219, i.e. log2(10)
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n" // 2**(x * C), approximate
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // re-pack, low half first
+        __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) // adjusts r for lanes whose input matches 0xBC95; NOTE(review): presumably the counterpart of the 0x3f75 fix-up in the other branch - confirm
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+, // other targets: per-half float computation followed by an explicit special-case fix-up
+    const float log10_2 = __uint_as_float(0x40549A78U); // NOTE: despite the name, this bit pattern is approx. 3.3219, i.e. log2(10)
+    float fl = __low2float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl));
+
+    float fh = __high2float(a) * log10_2;
+    asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh));
+
+    r = __floats2bfloat162_rn( fl, fh );
+
+    const __nv_bfloat162_raw araw = static_cast<__nv_bfloat162_raw>(a);
+    if (araw.x == (unsigned short)0xBC95U) // low half has the special-cased input bit pattern
+    {
+        __nv_bfloat16_raw raw_fix;
+        raw_fix.x = (unsigned short)0x3f75U; // hard-coded result bits
+        r.x = static_cast<__nv_bfloat16>(raw_fix);
+    }
+    if (araw.y == (unsigned short)0xBC95U) // high half has the special-cased input bit pattern
+    {
+        __nv_bfloat16_raw raw_fix;
+        raw_fix.x = (unsigned short)0x3f75U; // hard-coded result bits
+        r.y = static_cast<__nv_bfloat16>(raw_fix);
+    }
+)
+    return r;
+}
+
+__CUDA_BF16_DECL__ float __internal_device_fast_bf16log2(float x) // Approximate base-2 logarithm in float via lg2.approx.f32.
+{
+    asm("{ lg2.approx.f32 %0, %0; }" : "+f"(x)); // x is overwritten with the result
+    return x;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) {
+    // Base-2 logarithm of a bf16 value: widen to float, take the approximate log2, convert back to bf16.
+    const float log2_a = __internal_device_fast_bf16log2(__bfloat162float(a));
+    return __float2bfloat16_rn(log2_a);
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { // Approximate base-2 logarithm of each half of a packed bf16x2 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: the macro expands to a complete body, including the return
+    __APPROX_FCAST2(lg2)
+, // other targets: scalar float helper on each half, then re-pack
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) {
+    // Natural logarithm of a bf16 value, obtained in float as log2(a) * ln(2)
+    // and converted back to bf16. 0x3f317218 is ln(2) as a float.
+    const float ln2_scale = __uint_as_float(0x3f317218U);
+    const float log2_a = __internal_device_fast_bf16log2(__bfloat162float(a));
+    return __float2bfloat16_rn(log2_a * ln2_scale);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { // Approximate natural logarithm of each half of a packed bf16x2 value (log2 scaled by ln 2).
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: whole computation in one PTX block
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split a into its low and high 16-bit halves
+        "  mov.b32         h, %1;              \n" // h is not read again inside this block
+        "  mov.b32         fl, {0,hl};         \n" // widen: bf16 bits go into the upper half of a 32-bit register
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n" // approximate log2
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n" // same bits as flt_ln2 in the other branch
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // re-pack, low half first
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // other targets: the same steps per half in C++
+    const float flt_ln2 = __uint_as_float(0x3f317218U);
+
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    fl = fl * flt_ln2;
+
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    fh = fh * flt_ln2;
+
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) {
+    // Base-10 logarithm of a bf16 value, obtained in float as log2(a) * log10(2)
+    // and converted back to bf16. 0x3E9A209B is log10(2) as a float.
+    const float log10_2_scale = __uint_as_float(0x3E9A209BU);
+    const float log2_a = __internal_device_fast_bf16log2(__bfloat162float(a));
+    return __float2bfloat16_rn(log2_a * log10_2_scale);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { // Approximate base-10 logarithm of each half of a packed bf16x2 value (log2 scaled by log10 of 2).
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: whole computation in one PTX block
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // split a into its low and high 16-bit halves
+        "  mov.b32         h, %1;              \n" // h is not read again inside this block
+        "  mov.b32         fl, {0,hl};         \n" // widen: bf16 bits go into the upper half of a 32-bit register
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n" // approximate log2
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;      \n" // same bits as flt_log10_2 in the other branch
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n" // convert each f32 result back to bf16
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // re-pack, low half first
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+, // other targets: the same steps per half in C++
+    const float flt_log10_2 = __uint_as_float(0x3E9A209BU);
+
+    float fl = __low2float(a);
+    fl = __internal_device_fast_bf16log2(fl);
+    fl = fl * flt_log10_2;
+
+    float fh = __high2float(a);
+    fh = __internal_device_fast_bf16log2(fh);
+    fh = fh * flt_log10_2;
+
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { // Approximate reciprocal of each half of a packed bf16x2 value, evaluated in float.
+    float fl = __low2float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fl)); // low half, in place
+    float fh = __high2float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fh)); // high half, in place
+    return __floats2bfloat162_rn( fl, fh );
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { // Approximate reciprocal of a bf16 value, evaluated in float.
+    float fa = __bfloat162float(a);
+    asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fa)); // in place
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { // Approximate reciprocal square root of each half of a packed bf16x2 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: the macro expands to a complete body, including the return
+    __APPROX_FCAST2(rsqrt)
+, // other targets: rsqrt.approx.f32 on each half in float, then re-pack
+    float fl = __low2float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { // Approximate reciprocal square root of a bf16 value, evaluated in float.
+    float fa = __bfloat162float(a);
+    asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fa)); // in place
+    return __float2bfloat16_rn(fa);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { // Approximate square root of each half of a packed bf16x2 value.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, // targets providing SM_80: the macro expands to a complete body, including the return
+    __APPROX_FCAST2(sqrt)
+, // other targets: sqrt.approx.f32 on each half in float, then re-pack
+    float fl = __low2float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fl));
+    float fh = __high2float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fh));
+    return __floats2bfloat162_rn( fl, fh );
+)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { // Approximate square root of a bf16 value, evaluated in float.
+    float fa = __bfloat162float(a);
+    asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fa)); // in place
+    return __float2bfloat16_rn(fa);
+}
+#undef __APPROX_FCAST2
+#undef __BF16_SPEC_CASE2
+
+__CUDA_BF16_DECL__ bool __internal_device_hisnan(const __nv_bfloat16 a) // Device helper: true when a is NaN, detected by an unordered comparison of a with itself.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, // SM_90-capable targets: compare directly in bf16
+    __nv_bfloat16 r;
+    asm("{set.nan.bf16.bf16 %0,%1,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return __BFLOAT16_TO_CUS(r) != 0U; // any non-zero result bits mean the comparison held
+, // other targets: widen to f32 first and compare there
+    unsigned int r;
+    asm( "{.reg .b32 a;\n"
+         "  mov.b32 a, {0,%1};\n" // bf16 bits go into the upper half of a 32-bit register
+         "  set.nan.f32.f32 %0, a, a;}\n"
+         :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r != 0U;
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) // Per-half NaN test; the result is itself a bfloat162.
+{
+    __nv_bfloat162 r; // filled in by whichever branch is selected below
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" // packed PTX nan self-comparison on both halves at once
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); // r: 32-bit register operands holding both halves
+,
+    __nv_bfloat162_raw val; // fallback: build the raw result half by half
+    val.x = __hisnan(a.x) ? (unsigned short)0x3F80U : (unsigned short)0U; // 0x3F80 is the bf16 bit pattern of 1.0; 0 otherwise
+    val.y = __hisnan(a.y) ? (unsigned short)0x3F80U : (unsigned short)0U;
+    r = __nv_bfloat162(val);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) // Returns true when a is NaN; usable from host and device code.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation delegates to the PTX-based helper
+    return __internal_device_hisnan(a);
+,
+    const __nv_bfloat16_raw hr = static_cast<__nv_bfloat16_raw>(a); // host path: inspect the raw 16 bits
+    return ((hr.x & 0x7FFFU) > 0x7F80U); // drop the sign bit; magnitudes above the infinity pattern 0x7F80 are NaN
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) // Negates both halves of a.
+{
+    __nv_bfloat162 r; // filled in by whichever branch is selected below
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{neg.bf16x2 %0,%1;\n}" // packed PTX negate of both halves
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    r.x = __hneg(a.x); // fallback: negate each half through the scalar routine
+    r.y = __hneg(a.y);
+)
+    return r;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hneg(const __nv_bfloat16 a) // Device-side negation of one bf16 value.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 r;
+    asm("{neg.bf16 %0,%1;\n}" // native PTX bf16 negate
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+,
+    const float fa = __bfloat162float(a); // fallback: negate in float
+    return __float2bfloat16(__fmaf_ieee_rn(fa, -1.0f, -0.0f)); // fa * -1 + (-0) via FMA; NOTE(review): reason for FMA over unary minus is not visible here
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) // Negates one bf16 value; usable from host and device code.
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, // device compilation delegates to the device helper
+    return __internal_device_hneg(a);
+,
+    const float fa = __bfloat162float(a); // host path: widen, negate, narrow
+    return __float2bfloat16(-fa);
+)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) // Absolute value of both halves of a.
+{
+    __nv_bfloat162 r; // filled in by whichever branch is selected below
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{abs.bf16x2 %0,%1;\n}" // packed PTX absolute value of both halves
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    r.x = __habs(a.x); // fallback: apply the scalar routine to each half
+    r.y = __habs(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) // Absolute value of one bf16 value.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 r;
+    asm("{abs.bf16 %0,%1;\n}" // native PTX bf16 absolute value
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+,
+    __nv_bfloat16_raw abs_a_raw = static_cast<__nv_bfloat16_raw>(a); // fallback: operate on the raw 16 bits
+    abs_a_raw.x &= (unsigned short)0x7FFFU; // clear the sign bit
+    if (abs_a_raw.x > (unsigned short)0x7F80U) // magnitude above the infinity pattern means the input was NaN
+    {
+        // return canonical NaN
+        abs_a_raw.x = (unsigned short)0x7FFFU;
+    }
+    return static_cast<__nv_bfloat16>(abs_a_raw);
+)
+}
+
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) // Maximum of a and b; a single NaN input is ignored.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ max.bf16 %0,%1,%2;\n}" // native PTX bf16 maximum
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+,
+    __nv_bfloat16 maxval; // fallback built from the comparison helpers
+
+    maxval = (__hge(a, b) || __hisnan(b)) ? a : b; // take a when a >= b or when b is NaN, otherwise b
+
+    if (__hisnan(maxval)) // the selected value can only be NaN here when a and b are both NaN
+    {
+        // if both inputs are NaN, return canonical NaN
+        maxval = CUDART_NAN_BF16;
+    }
+    else if (__heq(a, b)) // equal values: disambiguate signed zeros by raw bit pattern
+    {
+        // hmax(+0.0, -0.0) = +0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __nv_bfloat16_raw ra = __nv_bfloat16_raw(a);
+        __nv_bfloat16_raw rb = __nv_bfloat16_raw(b);
+        maxval = (ra.x > rb.x) ? b : a; // the operand with the smaller raw bits is the +0.0 one
+    }
+
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) // Minimum of a and b; a single NaN input is ignored.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ min.bf16 %0,%1,%2;\n}" // native PTX bf16 minimum
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+,
+    __nv_bfloat16 minval; // fallback built from the comparison helpers
+
+    minval = (__hle(a, b) || __hisnan(b)) ? a : b; // take a when a <= b or when b is NaN, otherwise b
+
+    if (__hisnan(minval)) // the selected value can only be NaN here when a and b are both NaN
+    {
+        // if both inputs are NaN, return canonical NaN
+        minval = CUDART_NAN_BF16;
+    }
+    else if (__heq(a, b)) // equal values: disambiguate signed zeros by raw bit pattern
+    {
+        // hmin(+0.0, -0.0) = -0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __nv_bfloat16_raw ra = __nv_bfloat16_raw(a);
+        __nv_bfloat16_raw rb = __nv_bfloat16_raw(b);
+        minval = (ra.x > rb.x) ? a : b; // the operand with the larger raw bits is the -0.0 one
+    }
+
+    return minval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) // Maximum of a and b that propagates NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ max.NaN.bf16 %0,%1,%2;\n}" // native PTX bf16 maximum with the .NaN modifier
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+,
+    __nv_bfloat16 maxval; // fallback built from the comparison helpers
+
+    if (__hisnan(a) || __hisnan(b))
+    {
+        // if either input is NaN, return canonical NaN
+        maxval = CUDART_NAN_BF16;
+    }
+    else
+    {
+        maxval = __hge(a, b) ? a : b; // plain comparison; unlike __hmax there is no signed-zero tie-break in this branch
+    }
+
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) // Minimum of a and b that propagates NaN.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat16 val;
+    asm( "{ min.NaN.bf16 %0,%1,%2;\n}" // native PTX bf16 minimum with the .NaN modifier
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+,
+    __nv_bfloat16 minval; // fallback built from the comparison helpers
+
+    if (__hisnan(a) || __hisnan(b))
+    {
+        // if either input is NaN, return canonical NaN
+        minval = CUDART_NAN_BF16;
+    }
+    else
+    {
+        minval = __hle(a, b) ? a : b; // plain comparison; unlike __hmin there is no signed-zero tie-break in this branch
+    }
+
+    return minval;
+)
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) // Fused multiply-add of a, b, c with the PTX relu modifier.
+{
+    __nv_bfloat16 val;
+    asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" // single PTX instruction; no software fallback (enclosing #if excludes device arch below 800)
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) // Per-half maximum of a and b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ max.bf16x2 %0,%1,%2;\n}" // packed PTX maximum on both halves
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+,
+    __nv_bfloat162 val; // fallback: apply the scalar __hmax to each half
+    val.x = __hmax(a.x, b.x);
+    val.y = __hmax(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) // Per-half minimum of a and b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ min.bf16x2 %0,%1,%2;\n}" // packed PTX minimum on both halves
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+,
+    __nv_bfloat162 val; // fallback: apply the scalar __hmin to each half
+    val.x = __hmin(a.x, b.x);
+    val.y = __hmin(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) // Per-half NaN-propagating maximum of a and b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" // packed PTX maximum with the .NaN modifier
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+,
+    __nv_bfloat162 val; // fallback: apply the scalar __hmax_nan to each half
+    val.x = __hmax_nan(a.x, b.x);
+    val.y = __hmax_nan(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) // Per-half NaN-propagating minimum of a and b.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __nv_bfloat162 val;
+    asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" // packed PTX minimum with the .NaN modifier
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+,
+    __nv_bfloat162 val; // fallback: apply the scalar __hmin_nan to each half
+    val.x = __hmin_nan(a.x, b.x);
+    val.y = __hmin_nan(a.y, b.y);
+    return val;
+)
+}
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) // Per-half fused multiply-add of a, b, c with the PTX relu modifier.
+{
+    __nv_bfloat162 val;
+    asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" // single packed PTX instruction; no software fallback (enclosing #if excludes device arch below 800)
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) // Complex a*b + c, with .x as the real part and .y as the imaginary part.
+{
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); // c.re + a.re*b.re
+    __nv_bfloat16 img_tmp  = __hfma(a.x, b.y, c.y); // c.im + a.re*b.im
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); // subtract a.im*b.im by negating a.im before the FMA
+    img_tmp  = __hfma(a.y,         b.x, img_tmp); // add a.im*b.re
+    return make_bfloat162(real_tmp, img_tmp); // pack (real, imaginary) into one bfloat162
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) /* 64-bit Windows, LP64 targets, or NVRTC */
+#define __PTR   "l" /* inline-asm constraint: 64-bit register for the address operand */
+#else
+#define __PTR   "r" /* inline-asm constraint: 32-bit register for the address operand */
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) // Atomically adds val to *address (both halves as one 32-bit word).
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat162 r;
+    asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" // native packed bf16x2 atomic add; r receives the atom result
+                  : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val))
+                  : "memory"); // memory clobber: the asm modifies memory the compiler cannot see
+    return r;
+,
+    unsigned int* address_as_uint = (unsigned int*)address; // fallback: treat the pair as one 32-bit word for atomicCAS
+    unsigned int old = *address_as_uint; // initial snapshot of the stored word
+    unsigned int assumed;
+    do {
+        assumed = old; // value the new sum is computed from
+        __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); // reinterpret the word as bfloat162 and add val
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); // store only if memory still holds assumed
+    } while (assumed != old); // another writer intervened: retry with the freshly observed word
+    return *(__nv_bfloat162*)&old; // the word that was in memory just before the successful swap
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) // Atomically adds val to the single bf16 at *address.
+{
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    __nv_bfloat16 r;
+    asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" // native bf16 atomic add; r receives the atom result
+                  : "=h"(__BFLOAT16_TO_US(r))
+                  : __PTR(address), "h"(__BFLOAT16_TO_CUS(val))
+                  : "memory"); // memory clobber: the asm modifies memory the compiler cannot see
+    return r;
+,
+    unsigned short int* address_as_us = (unsigned short int*)address; // fallback: 16-bit atomicCAS loop on the raw bits
+    unsigned short int old = *address_as_us; // initial snapshot of the stored bits
+    unsigned short int assumed;
+    do {
+        assumed = old; // value the new sum is computed from
+        old = atomicCAS(address_as_us, assumed, // store only if memory still holds assumed
+            __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); // new bits = raw encoding of val + assumed
+    } while (assumed != old); // another writer intervened: retry with the freshly observed bits
+    return __ushort_as_bfloat16(old); // the value that was in memory just before the successful swap
+)
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+
+#undef __PTR
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+#endif /* defined(__cplusplus) */
+
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_BF16_DECL__
+#undef __CUDA_BF16_CONSTEXPR__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#undef __CPP_VERSION_AT_LEAST_11_BF16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#undef ___CUDA_BF16_STRINGIFY_INNERMOST
+#undef __CUDA_BF16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_BF16_HPP__ */
diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.h b/numba_cuda/numba/cuda/include/13/cuda_fp16.h
new file mode 100644
index 000000000..788b81452
--- /dev/null
+++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.h
@@ -0,0 +1,5363 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics
+* This section describes half precision intrinsic functions.
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+* All of the functions defined here are available in device code.
+* Some of the functions are also available to host compilers, please
+* refer to respective functions' documentation for details.
+*
+* NOTE: Aggressive floating-point optimizations performed by host or device
+* compilers may affect numeric behavior of the functions implemented in this
+* header.
+*
+* The following macros are available to help users selectively enable/disable
+* various definitions present in the header file:
+* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of
+* additional type aliases in the global namespace, helping to avoid potential
+* conflicts with symbols defined in the user program.
+* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the
+* use of the C++ type conversions (converting constructors and conversion
+* operators) that are common for built-in floating-point types, but may be
+* undesirable for \p half which is essentially a user-defined type.
+* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If
+* defined, these macros will prevent the inadvertent use of usual arithmetic
+* and comparison operators. This enforces the storage-only type semantics and
+* prevents C++ style computations on \p half and \p half2 types.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS Half Arithmetic Constants
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these constants, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+#ifndef __CUDA_FP16_H__
+#define __CUDA_FP16_H__
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_FP16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+#define __CUDA_FP16_TYPES_EXIST__
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Forward-declaration of structures defined in "cuda_fp16.hpp" */
+struct __half;
+struct __half2;
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts double number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts double number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+* - __double2half \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __double2half \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __double2half(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+*
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* - \p a converted to half precision using round-to-nearest-even mode.
+* - __float2half_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rn(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-towards-zero mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* - \p a converted to half precision using round-towards-zero mode.
+* - __float2half_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rz(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-down mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-down mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns half
+* - \p a converted to half precision using round-down mode.
+* - __float2half_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_rd(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-up mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-up mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns half
+* - \p a converted to half precision using round-up mode.
+* - __float2half_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __float2half_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __float2half_ru(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts \p half number to float.
+*
+* \details Converts half number \p a to float.
+* \param[in] a - half. Is only being read.
+*
+* \returns float
+* - \p a converted to float.
+* - __half2float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __half2float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __half2float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts input to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+*
+* \details Converts input \p a to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+* \param[in] a - float. Is only being read.
+*
+* \returns half2
+* - The \p half2 value with both halves equal to the converted half
+* precision number.
+*
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both input floats to half precision in round-to-nearest-even
+* mode and returns \p half2 with converted values.
+*
+* \details Converts both input floats to half precision in round-to-nearest-even mode
+* and combines the results into one \p half2 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read.
+* \param[in] b - float. Is only being read.
+*
+* \returns half2
+* - The \p half2 value with corresponding halves equal to the
+* converted input floats.
+*
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts low 16 bits of \p half2 to float and returns the result
+*
+* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns float
+* - The low 16 bits of \p a converted to float.
+*
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts high 16 bits of \p half2 to float and returns the result
+*
+* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns float
+* - The high 16 bits of \p a converted to float.
+*
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed char in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed char
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns signed char
+* - \p h converted to a signed char using round-towards-zero mode.
+* - __half2char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F.
+* - __half2char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80.
+* - __half2char_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned char in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned
+* char in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned char
+* - \p h converted to an unsigned char using round-towards-zero mode.
+* - __half2uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF.
+* - __half2uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uchar_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-towards-zero mode.
+* - __half2short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-towards-zero mode.
+* - __half2ushort_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-towards-zero mode.
+* - __half2int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-towards-zero mode.
+* - __half2uint_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rz(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-towards-zero mode.
+* - __half2ll_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rz(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-towards-zero mode.
+* - __half2ull_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rz(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Vector function, combines two \p __half numbers into one \p __half2 number.
+*
+* \details Combines two input \p __half numbers \p x and \p y into one \p __half2 number.
+* Input \p x is stored in low 16 bits of the return value, input \p y is stored
+* in high 16 bits of the return value.
+* \param[in] x - half. Is only being read.
+* \param[in] y - half. Is only being read.
+*
+* \returns __half2
+* - The \p __half2 vector with its low half equal to \p x and its high half equal to \p y.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both components of \p float2 number to half precision in
+* round-to-nearest-even mode and returns \p half2 with converted values.
+*
+* \details Converts both components of \p float2 to half precision in round-to-nearest-even
+* mode and combines the results into one \p half2 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read.
+*
+* \returns half2
+* - The \p half2 which has corresponding halves equal to the
+* converted \p float2 components.
+*
+* \see __float2half_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both halves of \p half2 to \p float2 and returns the result.
+*
+* \details Converts both halves of \p half2 input \p a to \p float2 and returns the
+* result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns float2
+* - \p a converted to \p float2.
+*
+* \see __half2float(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-to-nearest-even mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-to-nearest-even mode.
+* - __half2int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-down mode.
+* - __half2int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __half2int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2int_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __half2int_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __half2int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-to-nearest-even mode.
+*
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-down mode.
+*
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-up mode.
+*
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-to-nearest-even mode.
+* - __half2short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-down mode.
+* - __half2short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer using round-up mode.
+* - __half2short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF.
+* - __half2short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000.
+* - __half2short_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-down mode.
+*
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-up mode.
+*
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-to-nearest-even mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-to-nearest-even mode.
+* - __half2uint_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-down mode.
+* - __half2uint_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer using round-up mode.
+* - __half2uint_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2uint_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF.
+* - __half2uint_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2uint_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-to-nearest-even mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-towards-zero mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-down mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-to-nearest-even mode.
+* - __half2ushort_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rn(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-down mode.
+* - __half2ushort_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
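+* - __half2ushort_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/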
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-up mode.
+* - __half2ushort_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
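+* - __half2ushort_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/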
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-down mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-up mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-to-nearest-even mode.
+* - __half2ull_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rn(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-down mode.
+* - __half2ull_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_rd(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer using round-up mode.
+* - __half2ull_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ull_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF.
+* - __half2ull_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ull_ru(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-down mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-up mode.
+*
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-to-nearest-even mode.
+* - __half2ll_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rn(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-down mode.
+* - __half2ll_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_rd(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer using round-up mode.
+* - __half2ll_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ll_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF.
+* - __half2ll_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000.
+* - __half2ll_ru(NaN) returns \p 0x8000000000000000.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the largest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The truncated value.
+* - htrunc(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - htrunc(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - htrunc(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The smallest integer value not less than \p h.
+* - hceil(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hceil(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hceil(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hceil(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The largest integer value which is less than or equal to \p h.
+* - hfloor(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hfloor(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hfloor(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hfloor(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in half-precision floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The nearest integer to \p h.
+* - hrint(
+* \cuda_math_formula \pm 0 \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hrint(
+* \cuda_math_formula \pm \infty \end_cuda_math_formula
+* ) returns
+* \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrint(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrint(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Truncate \p half2 vector input argument to the integral part.
+*
+* \details Round each component of vector \p h to the largest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* - The truncated \p h.
+*
+* \see htrunc(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate \p half2 vector ceiling of the input argument.
+*
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* - The vector of smallest integers not less than \p h.
+*
+* \see hceil(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* - The vector of largest integers which is less than or equal to \p h.
+*
+* \see hfloor(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* number.
+*
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* - The vector of rounded integer values.
+*
+* \see hrint(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+*
+* \details Returns a \p half2 number with both halves equal to the input \p half
+* number \p a.
+* \param[in] a - half. Is only being read.
+*
+* \returns half2
+* - The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+*
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - \p a with its halves being swapped.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. The low 16 bits of input \p a are stored in the low 16 bits of
+* the return value; the low 16 bits of input \p b are stored in the high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. The high 16 bits of input \p a are stored in the low 16 bits of
+* the return value; the high 16 bits of input \p b are stored in the high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - The \p half that contains the low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* - -1 if \p a is equal to negative infinity,
+* - 1 if \p a is equal to positive infinity,
+* - 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines the two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half2
+* - The half2 with one half equal to \p a and the other to \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+*
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The half2 with both halves equal to the low 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+*
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The half2 with both halves equal to the high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as a signed short integer.
+*
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as a signed short integer.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as an unsigned short integer.
+*
+* \details Reinterprets the bits in the half-precision floating-point \p h
+* as an unsigned short number.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a signed short integer as a \p half.
+*
+* \details Reinterprets the bits in the signed short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p half.
+*
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zero (of either sign), +0.0 is considered greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zero (of either sign), +0.0 is considered greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zero (of either sign), +0.0 is considered greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of the elementwise maximum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If both inputs are zero (of either sign), +0.0 is considered greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of the elementwise minimum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32 /* fallback value; used as the default `width` argument of the shuffle declarations in this header */
+#define __local_warpSize /* marker: set only when the warpSize fallback above was defined here */
+#endif /* !defined warpSize && !defined __local_warpSize */
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+/* Legacy (non-_sync) warp shuffle overloads: declared only when __CUDA_ARCH__ is undefined or below 700, and tagged deprecated in favor of the _sync variants. */
+#if defined(_WIN32)
+# define __CUDA_FP16_DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated)) /* msg is not forwarded on this branch */
+#else
+# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif /* compiler-specific definition of __CUDA_FP16_DEPRECATED__ */
+
+#if defined(_NVHPC_CUDA)
+#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release."
+#else
+#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif /* defined(_NVHPC_CUDA) */
+
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
+
+#undef __CUDA_FP16_WSB_DEPRECATION_MESSAGE
+#undef __CUDA_FP16_DEPRECATED__
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane.
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1],
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e.
+* within the same subsection). \p width must have a value which is a power of 2;
+* results are undefined if \p width is not a power of 2, or is a number greater than
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read.
+* \param[in] srcLane - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID.
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged.
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2,
+* or is a number greater than \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID.
+* The value of \p var held by the resulting thread ID is returned: this has the effect
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. Similarly to __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of \p width and the upper \p delta threads
+* will remain unchanged.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask:
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each
+* group of \p width consecutive threads is able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of \p var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half2. Is only being read.
+* \param[in] laneMask - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane.
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1],
+* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e.
+* within the same subsection). \p width must have a value which is a power of 2;
+* results are undefined if \p width is not a power of 2, or is a number greater than
+* \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read.
+* \param[in] srcLane - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID.
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up
+* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged.
+* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2,
+* or is a number greater than \p warpSize.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding \p delta to the caller's thread ID.
+* The value of \p var held by the resulting thread ID is returned: this has the effect
+* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of \p width and the upper \p delta threads
+* will remain unchanged.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask:
+* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each
+* group of \p width consecutive threads are able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of \p var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* Threads may only read data from another thread which is actively participating in the
+* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined.
+* \param[in] mask - unsigned int. Is only being read.
+*  - Indicates the threads participating in the call.
+*  - A bit, representing the thread's lane ID, must be set for each participating thread
+*    to ensure they are properly converged before the intrinsic is executed by the hardware.
+*  - Each calling thread must have its own bit set in the \p mask and all non-exited threads
+*    named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined.
+* \param[in] var - half. Is only being read.
+* \param[in] laneMask - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width = warpSize);
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */
+
+#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
+#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The \p half2 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The half2 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub
+* into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of
+* mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise division of \p a by \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - Returns \p a with the absolute value of both halves.
+*
+* \see __habs(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise multiplication of vectors \p a and \p b,
+* with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
+* with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Negates both halves of the input \p half2 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p half2 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - Returns \p a with both halves negated.
+*
+* \see __hneg(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Calculates the absolute value of input \p half number and returns the result.
+*
+* \details Calculates the absolute value of input \p half number and returns the result.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The absolute value of \p a.
+* - __habs \cuda_math_formula (\pm 0)\end_cuda_math_formula returns +0.
+* - __habs \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - __habs(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half division in round-to-nearest-even mode.
+*
+* \details Divides \p half input \p a by input \p b in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of dividing \p a by \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__  __half __hdiv(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p a,
+* \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p a,
+* \p b, and \p c, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Negates input \p half number and returns the result.
+*
+* \details Negates input \p half number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - Negated input \p a.
+* - __hneg \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \mp 0 \end_cuda_math_formula.
+* - __hneg \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \mp \infty \end_cuda_math_formula.
+* - __hneg(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison and returns boolean true
+* if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of not-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of less-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of greater-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of less-than comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison and returns boolean
+* true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of greater-than
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered if-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered less-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison and
+* returns boolean true if both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector unordered greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-equal comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison and returns
+* boolean true if both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector unordered less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered less-than comparison of
+* vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison and
+* returns boolean true if both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector unordered greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half if-equal comparison.
+*
+* \details Performs \p half if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half not-equal comparison.
+*
+* \details Performs \p half not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-equal comparison.
+*
+* \details Performs \p half less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-equal comparison.
+*
+* \details Performs \p half greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-than comparison.
+*
+* \details Performs \p half less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-than comparison.
+*
+* \details Performs \p half greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered if-equal comparison.
+*
+* \details Performs \p half unordered if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered not-equal comparison.
+*
+* \details Performs \p half unordered not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-equal comparison.
+*
+* \details Performs \p half unordered less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-equal comparison.
+*
+* \details Performs \p half unordered greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-than comparison.
+*
+* \details Performs \p half unordered less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-than comparison.
+*
+* \details Performs \p half unordered greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Determine whether \p half argument is a NaN.
+*
+* \details Determine whether \p half value \p a is a NaN.
+* \param[in] a - half. Is only being read.
+*
+* \returns bool
+* - true if argument is NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If the values of both inputs are 0.0, then +0.0 > -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If the values of both inputs are 0.0, then +0.0 > -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p
+* a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If the values of both inputs are 0.0, then +0.0 > -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of the inputs is NaN, then canonical NaN is returned.
+* - If the values of both inputs are 0.0, then +0.0 > -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
+
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate
+*
+* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as
+* complex numbers in \p half precision: (a.x + I*a.y), (b.x + I*b.y), (c.x + I*c.y)
+* and performs complex multiply-accumulate operation: a*b + c in a simple way:
+* ((a.x*b.x + c.x) - a.y*b.y) + I*((a.x*b.y + c.y) + a.y*b.x)
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
+* - __half2 result = __hcmadd(a, b, c) is numerically in agreement with:
+* - result.x = __hfma(-a.y, b.y, __hfma(a.x, b.x, c.x))
+* - result.y = __hfma( a.y, b.x, __hfma(a.x, b.y, c.y))
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half square root of input: \cuda_math_formula \sqrt{a} \end_cuda_math_formula in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The square root of \p a.
+* - hsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN.
+* - hsqrt(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half reciprocal square root of input: \cuda_math_formula \frac{1}{\sqrt{a}}\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The reciprocal square root of \p a.
+* - hrsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns +0.
+* - hrsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN.
+* - hrsqrt(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half reciprocal of input: \cuda_math_formula \frac{1}{a}\end_cuda_math_formula in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The reciprocal of \p a.
+* - hrcp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - hrcp \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - hrcp(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrcp(const __half a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half natural logarithm of input: \cuda_math_formula \ln(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The natural logarithm of \p a.
+* - hlog \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog(1) returns +0.
+* - hlog(x), x < 0 returns NaN.
+* - hlog \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half binary logarithm of input: \cuda_math_formula \log_{2}(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The binary logarithm of \p a.
+* - hlog2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog2(1) returns +0.
+* - hlog2(x), x < 0 returns NaN.
+* - hlog2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog2(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half decimal logarithm of input: \cuda_math_formula \log_{10}(a)\end_cuda_math_formula in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The decimal logarithm of \p a.
+* - hlog10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula.
+* - hlog10(1) returns +0.
+* - hlog10(x), x < 0 returns NaN.
+* - hlog10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hlog10(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half natural exponential function of input: \cuda_math_formula e^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The natural exponential function on \p a.
+* - hexp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp(const __half a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates approximate \p half hyperbolic tangent function.
+*
+* \details Calculates approximate \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula.
+* This operation uses HW acceleration on devices of compute capability 7.5 and higher.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The approximate hyperbolic tangent function of \p a.
+* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh_approx(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htanh_approx(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector approximate hyperbolic tangent function.
+*
+* \details Calculates \p half2 approximate hyperbolic tangent function of input vector \p a.
+* This operation uses HW acceleration on devices of compute capability 7.5 and higher.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise approximate hyperbolic tangent function on vector \p a.
+*
+* \see htanh_approx(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half hyperbolic tangent function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The hyperbolic tangent function of \p a.
+* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula.
+* - htanh(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htanh(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector hyperbolic tangent function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 hyperbolic tangent function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise hyperbolic tangent function on vector \p a.
+*
+* \see htanh(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half binary exponential function of input: \cuda_math_formula 2^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The binary exponential function on \p a.
+* - hexp2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp2 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp2(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp2(const __half a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half decimal exponential function of input: \cuda_math_formula 10^{a}\end_cuda_math_formula in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The decimal exponential function on \p a.
+* - hexp10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hexp10 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0.
+* - hexp10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula.
+* - hexp10(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The cosine of \p a.
+* - hcos \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1.
+* - hcos \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN.
+* - hcos(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hcos(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half sine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The sine of \p a.
+* - hsin \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula.
+* - hsin \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN.
+* - hsin(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsin(const __half a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise square root on vector \p a.
+*
+* \see hsqrt(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise reciprocal square root on vector \p a.
+*
+* \see hrsqrt(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise reciprocal on vector \p a.
+*
+* \see hrcp(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise natural logarithm on vector \p a.
+*
+* \see hlog(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise binary logarithm on vector \p a.
+*
+* \see hlog2(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise decimal logarithm on vector \p a.
+*
+* \see hlog10(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise exponential function on vector \p a.
+*
+* \see hexp(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise binary exponential function on vector \p a.
+*
+* \see hexp2(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise decimal exponential function on vector \p a.
+*
+* \see hexp10(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise cosine on vector \p a.
+*
+* \see hcos(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise sine on vector \p a.
+*
+* \see hsin(__half) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Vector adds \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two \p __half elements; the entire \p __half2 is not guaranteed to be atomic as a single 32-bit access.
+*
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is natively supported by devices of compute capability 6.x and higher;
+* older devices use an emulation path.
+*
+* \param[in,out] address - half2*. An address in global or shared memory; the sum is written back to it.
+* \param[in] val - half2. The value to be added.
+*
+* \returns half2
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val);
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+*
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher.
+*
+* \param[in,out] address - half*. An address in global or shared memory; the sum is written back to it.
+* \param[in] val - half. The value to be added.
+*
+* \returns half
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+#endif /*defined(__CUDACC__) || defined(_NVHPC_CUDA)*/
+
+
+#endif /* defined(__cplusplus) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L /* non-MSVC compilers: test __cplusplus */
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L /* MSVC: test _MSVC_LANG instead of __cplusplus */
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#endif /* C++11-or-newer detection */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+/* C++11 header for ::std::move.
+ * In RTC mode, ::std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for ::std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_FP16_INLINE__
+#define __CUDA_FP16_FORCEINLINE__
+#else /* not an RTC compilation, or __CUDACC_VER_MAJOR__.__CUDACC_VER_MINOR__ older than 12.3 */
+#define __CUDA_FP16_INLINE__ inline
+#define __CUDA_FP16_FORCEINLINE__ __forceinline__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !(__cplusplus >= 201103L); NOTE(review): tests __cplusplus directly, not __CPP_VERSION_AT_LEAST_11_FP16 */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+
+// Define __CUDA_FP16_CONSTEXPR__ so that constexpr is used where the
+// C++ dialect supports it; the macro expands to nothing otherwise.
+// undef after use
+#if (defined __CPP_VERSION_AT_LEAST_11_FP16)
+#define __CUDA_FP16_CONSTEXPR__   constexpr
+#else
+#define __CUDA_FP16_CONSTEXPR__
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half_raw data type
+ * \details Type allows static initialization of \p half until it becomes
+ * a built-in type.
+ *
+ * - Note: this initialization is as a bit-field representation of \p half,
+ * and not a conversion from \p short to \p half.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(2) {
+    /**
+     * Storage field holding the bit representation of the \p half floating-point number.
+     */
+    unsigned short x;
+} __half_raw;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half2_raw data type
+ * \details Type allows static initialization of \p half2 until it becomes
+ * a built-in type.
+ *
+ * - Note: this initialization is as a bit-field representation of \p half2,
+ * and not a conversion from \p short2 to \p half2.
+ * Such representation will be deprecated in a future version of CUDA.
+ *
+ * - Note: this is visible to non-nvcc compilers, including C-only compilations
+ */
+typedef struct __CUDA_ALIGN__(4) {
+    /**
+     * Storage field holding the bits of the lower \p half part.
+     */
+    unsigned short x;
+    /**
+     * Storage field holding the bits of the upper \p half part.
+     */
+    unsigned short y;
+} __half2_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+// forward-declaration of bfloat type to be used in converting constructor
+struct __nv_bfloat16;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half data type
+ * \details This structure implements the datatype for storing
+ * half-precision floating-point numbers. The structure implements
+ * assignment, arithmetic and comparison operators, and type conversions.
+ * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent,
+ * and the significand is being stored in 10 bits.
+ * The total precision is 11 bits. There are 15361 representable
+ * numbers within the interval [0.0, 1.0], endpoints included.
+ * On average we have log10(2**11) ~ 3.311 decimal digits.
+ *
+ * The objective here is to provide IEEE754-compliant implementation
+ * of \p binary16 type and arithmetic with limitations due to
+ * device HW not supporting floating-point exceptions.
+ */
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    /**
+     * Protected storage variable contains the bits of floating-point data.
+     */
+    unsigned short __x;
+
+public:
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /* Convert to/from __half_raw */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from \p __half_raw.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half(const __half_raw &hr) : __x(hr.x) { }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half_raw.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half_raw to \p volatile \p __half.
+     */
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p volatile \p __half_raw to \p volatile \p __half.
+     */
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half_raw operator.
+     */
+    __CUDA_HOSTDEVICE__ operator __half_raw() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half_raw operator with \p volatile input.
+     */
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile;
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p __nv_bfloat16 input using default round-to-nearest-even rounding mode.
+     * Need to include the header file \p cuda_bf16.h
+     */
+    explicit __CUDA_HOSTDEVICE__ __half(const __nv_bfloat16 f); //forward declaration only, implemented in cuda_bf16.hpp
+#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+    /* Construct from float/double */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p float input using default round-to-nearest-even rounding mode.
+     *
+     * \see __float2half(float) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p double input using default round-to-nearest-even rounding mode.
+     *
+     * \see __double2half(double) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p float operator.
+     */
+    __CUDA_HOSTDEVICE__ operator float() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half assignment operator from \p float input using default round-to-nearest-even rounding mode.
+     *
+     * \see __float2half(float) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f);
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast to \p __half assignment operator from \p double input using default round-to-nearest-even rounding mode.
+     *
+     * \see __double2half(double) for further details.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f);
+
+/*
+ * Implicit type conversions to/from integer types were only available to nvcc compilation.
+ * Introducing them for all compilers is a potentially breaking change that may affect
+ * overloads resolution and will require users to update their code.
+ * Define __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out.
+ */
+#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p short integer input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p int input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (default: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ll2half_rn(static_cast<long long>(val)).__x;
+        } else {
+            __x = __int2half_rn(static_cast<int>(val)).__x;
+        }
+    }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned long val) {
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (default: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            __x = __ull2half_rn(static_cast<unsigned long long>(val)).__x;
+        } else {
+            __x = __uint2half_rn(static_cast<unsigned int>(val)).__x;
+        }
+    }
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Construct \p __half from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+    /* Allow automatic casts to supported built-in types, matching all that are permitted with float */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2char_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator signed char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2uchar_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in __half2char_rz(__half) and __half2uchar_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator char() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2short_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator short() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p short data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ushort_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned short() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2int_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator int() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2uint_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned int() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects size of the \p long type and proceeds accordingly, see
+     * further details in __half2int_rz(__half) and __half2ll_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * Detects size of the \p unsigned \p long type and proceeds
+     * accordingly, see further details in __half2uint_rz(__half) and __half2ull_rz(__half).
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ll_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator long long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p unsigned \p long \p long data type.
+     * Using round-toward-zero rounding mode.
+     *
+     * \see __half2ull_rz(__half) for further details.
+     */
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const short val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const int val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const long long val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode.
+     */
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+};
+
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Some basic arithmetic operations expected of a built-in */
+
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half addition operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half subtraction operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half multiplication operation.
+ * \see __hmul(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half division operation.
+ * \see __hdiv(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with addition operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with subtraction operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with multiplication operation.
+ * \see __hmul(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half compound assignment with division operation.
+ * \see __hdiv(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh);
+/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half prefix increment operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half prefix decrement operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half postfix increment operation.
+ * \see __hadd(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator++(__half &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Performs \p half postfix decrement operation.
+ * \see __hsub(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator--(__half &h, const int ignored);
+
+/* Unary plus and negation operators */
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Implements \p half unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h);
+/**
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
+ * Implements \p half unary minus operator.
+ * \see __hneg(__half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h);
+/* Some basic comparison operations to make it look like a built-in */
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered compare equal operation.
+ * \see __heq(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half unordered compare not-equal operation.
+ * \see __hneu(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered greater-than compare operation.
+ * \see __hgt(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered less-than compare operation.
+ * \see __hlt(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered greater-or-equal compare operation.
+ * \see __hge(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh);
+/**
+ * \ingroup CUDA_MATH__HALF_COMPARISON
+ * Performs \p half ordered less-or-equal compare operation.
+ * \see __hle(__half, __half)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh);
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief __half2 data type
+ * \details This structure implements the datatype for storing two
+ * half-precision floating-point numbers.
+ * The structure implements assignment, arithmetic and comparison
+ * operators, and type conversions.
+ *
+ * - NOTE: __half2 is visible to non-nvcc host compilers
+ */
+struct __CUDA_ALIGN__(4) __half2 {
+    /**
+     * Storage field holding lower \p __half part.
+     */
+    __half x;
+    /**
+     * Storage field holding upper \p __half part.
+     */
+    __half y;
+
+    // All construct/copy/assign/move
+public:
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * \brief Constructor by default.
+     * \details Empty default constructor, result is uninitialized.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half2() = default;
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Move constructor, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src));
+,
+    this->x = src.x;
+    this->y = src.y;
+)
+}
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Move assignment operator, available for \p C++11 and later dialects
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src);
+#else
+    __CUDA_HOSTDEVICE__ __half2() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from two \p __half variables
+     */
+    __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Copy constructor
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src);
+,
+    this->x = src.x;
+    this->y = src.y;
+)
+}    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Copy assignment operator
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src);
+
+    /* Convert to/from __half2_raw */
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Constructor from \p __half2_raw
+     */
+    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) {
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r);
+,
+    __half_raw tr;
+    tr.x = h2r.x;
+    this->x = static_cast<__half>(tr);
+    tr.x = h2r.y;
+    this->y = static_cast<__half>(tr);
+)
+}
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Assignment operator from \p __half2_raw
+     */
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r);
+    /**
+     * \ingroup CUDA_MATH__HALF_MISC
+     * Conversion operator to \p __half2_raw
+     */
+    __CUDA_HOSTDEVICE__ operator __half2_raw() const;
+};
+
+#if !defined(__CUDA_NO_HALF2_OPERATORS__)
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half addition operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half subtraction operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half multiplication operation.
+ * \see __hmul2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half division operation.
+ * \see __h2div(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with addition operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with subtraction operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with multiplication operation.
+ * \see __hmul2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half compound assignment with division operation.
+ * \see __h2div(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half prefix increment operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half prefix decrement operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half postfix increment operation.
+ * \see __hadd2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator++(__half2 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Performs packed \p half postfix decrement operation.
+ * \see __hsub2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator--(__half2 &h, const int ignored);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Implements packed \p half unary plus operator, returns input value.
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
+ * Implements packed \p half unary minus operator.
+ * \see __hneg2(__half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered compare equal operation.
+ * \see __hbeq2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half unordered compare not-equal operation.
+ * \see __hbneu2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered greater-than compare operation.
+ * \see __hbgt2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered less-than compare operation.
+ * \see __hblt2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered greater-or-equal compare operation.
+ * \see __hbge2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh);
+/**
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
+ * Performs packed \p half ordered less-or-equal compare operation.
+ * \see __hble2(__half2, __half2)
+ */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh);
+
+#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */
+#endif /* defined(__cplusplus) */
+
+#if (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \
+    !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))))
+
+/* Note the .hpp file is included to capture the "half" & "half2" built-in function definitions. For NVRTC, the built-in
+   function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at
+   link time.
+*/
+#include "cuda_fp16.hpp"
+#endif /* (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \
+          !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */
+
+/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
+/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_HALF)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of the half-precision numbers format.
+ *
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __half half;
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is meant to be the first-class or fundamental
+ * implementation of type for pairs of half-precision numbers.
+ *
+ * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective
+ * user-level type with underscores.
+ */
+typedef __half2 half2;
+// for consistency with __nv_bfloat16
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half      __nv_half;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half2     __nv_half2;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half_raw  __nv_half_raw;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p __nv_ prefixed alias
+ */
+typedef __half2_raw __nv_half2_raw;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p nv_ prefixed alias
+ */
+typedef __half        nv_half;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
+ * \brief This datatype is an \p nv_ prefixed alias
+ */
+typedef __half2       nv_half2;
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
+
+#undef __CUDA_FP16_DECL__
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_FP16_INLINE__
+#undef __CUDA_FP16_FORCEINLINE__
+#undef ___CUDA_FP16_STRINGIFY_INNERMOST
+#undef __CUDA_FP16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_FP16_H__ */
diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp
new file mode 100644
index 000000000..4259992df
--- /dev/null
+++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp
@@ -0,0 +1,3483 @@
+/*
+* Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(IF_DEVICE_OR_CUDACC) /* three-way code selector; only defined here if nothing defined it earlier */
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) /* under __CUDACC__: d when NV_IS_DEVICE, otherwise c; f is dropped */
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) /* without __CUDACC__: d when NV_IS_DEVICE, otherwise f; c is dropped */
+#endif
+#endif
+
+/* Macros for half & half2 two- and three-operand arithmetic: each expands to a whole braced body that runs "<name>.f16"/"<name>.f16x2" on the identifiers a, b (and c) in scope at the expansion site and returns val */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines floating-point positive infinity value for the \p half data type
+ */
+#define CUDART_INF_FP16            __ushort_as_half((unsigned short)0x7C00U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines canonical NaN value for the \p half data type
+ */
+#define CUDART_NAN_FP16            __ushort_as_half((unsigned short)0x7FFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a minimum representable (denormalized) value for the \p half data type
+ */
+#define CUDART_MIN_DENORM_FP16     __ushort_as_half((unsigned short)0x0001U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a maximum representable value for the \p half data type
+ */
+#define CUDART_MAX_NORMAL_FP16     __ushort_as_half((unsigned short)0x7BFFU)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a negative zero value for the \p half data type
+ */
+#define CUDART_NEG_ZERO_FP16       __ushort_as_half((unsigned short)0x8000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a positive zero value for the \p half data type
+ */
+#define CUDART_ZERO_FP16           __ushort_as_half((unsigned short)0x0000U)
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS
+ * \brief Defines a value of 1.0 for the \p half data type
+ */
+#define CUDART_ONE_FP16            __ushort_as_half((unsigned short)0x3C00U)
+
+#if !(defined __DOXYGEN_ONLY__)
+
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const __half_raw &hr) { __x = hr.x; return *this; } // copies the raw 16-bit pattern unchanged
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } // same copy, for a volatile destination
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } // same copy, volatile source and destination
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } // exposes the stored bits as a __half_raw
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; } // same, readable from a volatile object
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator float() const { return __half2float(*this); } // widening conversion, delegates to __half2float
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; } // converts with __float2half, then stores the resulting bits
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; } // converts with __double2half, then stores the resulting bits
+#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator signed char() const { return __half2char_rz(*this); } // delegates to __half2char_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned char() const { return __half2uchar_rz(*this); } // delegates to __half2uchar_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator char() const { // conversion to plain char, following whichever signedness char has on this platform
+    char value;
+    /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (((char)-1) < (char)0) // true exactly when plain char is a signed type
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {
+        value = static_cast<char>(__half2char_rz(*this)); // signed char: use the signed 8-bit conversion
+    }
+    else
+    {
+        value = static_cast<char>(__half2uchar_rz(*this)); // unsigned char: use the unsigned 8-bit conversion
+    }
+    return value;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator short() const { return __half2short_rz(*this); } // delegates to __half2short_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned short() const { return __half2ushort_rz(*this); } // delegates to __half2ushort_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator int() const { return __half2int_rz(*this); } // delegates to __half2int_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned int() const { return __half2uint_rz(*this); } // delegates to __half2uint_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long() const { // conversion to long, picking the helper that matches the width of long on this platform
+    long retval;
+    /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (sizeof(long) == sizeof(long long)) // compile-time constant: long is as wide as long long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {
+        retval = static_cast<long>(__half2ll_rz(*this)); // wide long: go through the long long conversion
+    }
+    else
+    {
+        retval = static_cast<long>(__half2int_rz(*this)); // narrower long: go through the int conversion
+    }
+    return retval;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long() const { // conversion to unsigned long, picking the helper that matches its width on this platform
+    unsigned long retval;
+    /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    if (sizeof(unsigned long) == sizeof(unsigned long long)) // compile-time constant: unsigned long is as wide as unsigned long long
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    {
+        retval = static_cast<unsigned long>(__half2ull_rz(*this)); // wide case: go through the unsigned long long conversion
+    }
+    else
+    {
+        retval = static_cast<unsigned long>(__half2uint_rz(*this)); // narrower case: go through the unsigned int conversion
+    }
+    return retval;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long long() const { return __half2ll_rz(*this); } // delegates to __half2ll_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long long() const { return __half2ull_rz(*this); } // delegates to __half2ull_rz
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } // integer assignments below all convert with the matching *2half_rn helper and store its bits
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } // via __ushort2half_rn
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } // via __int2half_rn
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } // via __uint2half_rn
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; } // via __ll2half_rn
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } // via __ull2half_rn
+
+#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Some basic arithmetic operations expected of a built-in */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } // forwards to __hadd
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } // forwards to __hsub
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } // forwards to __hmul
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } // forwards to __hdiv
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } // compound forms update lh in place and return it
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h)      { __half_raw one; one.x = 0x3C00U; h += one; return h; } // prefix ++: 0x3C00 is the bit pattern of 1.0 (same value as CUDART_ONE_FP16)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h)      { __half_raw one; one.x = 0x3C00U; h -= one; return h; } // prefix --: subtracts the same 1.0 constant
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator++(__half &h, const int ignored)
+{
+    // Postfix increment: add 1.0 to h and hand back the value it held beforehand.
+    // The int parameter only selects the postfix overload; it is deliberately unused.
+    static_cast<void>(ignored);
+    __half_raw unit;
+    unit.x = 0x3C00U; // bit pattern of 1.0 in half precision
+    const __half previous = h;
+    h += unit;
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half  operator--(__half &h, const int ignored)
+{
+    // Postfix decrement: subtract 1.0 from h and hand back the value it held beforehand.
+    // The int parameter only selects the postfix overload; it is deliberately unused.
+    static_cast<void>(ignored);
+    __half_raw unit;
+    unit.x = 0x3C00U; // bit pattern of 1.0 in half precision
+    const __half previous = h;
+    h -= unit;
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h) { return h; } // unary plus: returns a copy
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h) { return __hneg(h); } // unary minus: forwards to __hneg
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } // forwards to __heq
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } // forwards to __hneu (the "u" variant), not to a negated __heq
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } // forwards to __hgt
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } // forwards to __hlt
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } // forwards to __hge
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } // forwards to __hle
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &&src) { // move assignment; copies both halves and returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); // one store through the __HALF2_TO_UI/CUI views (the same views feed 32-bit "r" asm operands in this file)
+, // otherwise: member-wise copy
+    this->x = src.x;
+    this->y = src.y;
+)
+    return *this;
+}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &src) { // copy assignment; copies both halves and returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); // one store through the __HALF2_TO_UI/CUI views (the same views feed 32-bit "r" asm operands in this file)
+, // otherwise: member-wise copy
+    this->x = src.x;
+    this->y = src.y;
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2_raw &h2r) { // assignment from the raw pair; returns *this
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); // one store through the __HALF2_TO_UI/CUI views
+, // otherwise: rebuild each half from its raw bits
+    __half_raw tr; // scratch raw value reused for both halves
+    tr.x = h2r.x;
+    this->x = static_cast<__half>(tr);
+    tr.x = h2r.y;
+    this->y = static_cast<__half>(tr);
+)
+    return *this;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2::operator __half2_raw() const { // exposes both halves as a __half2_raw
+    __half2_raw ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    ret.x = 0U; // NOTE(review): both fields are overwritten by the store below; presumably this only keeps ret from looking uninitialized - confirm
+    ret.y = 0U;
+    __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); // one store through the __HALF2_TO_UI/CUI views
+, // otherwise: copy the raw bits of each half separately
+    ret.x = static_cast<__half_raw>(this->x).x;
+    ret.y = static_cast<__half_raw>(this->y).x;
+)
+    return ret;
+}
+#if !defined(__CUDA_NO_HALF2_OPERATORS__)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } // forwards to __hadd2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } // forwards to __hsub2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } // forwards to __hmul2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } // forwards to __h2div
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } // compound forms update lh in place and return it
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } // prefix ++: adds 1.0 (0x3C00) to both halves
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } // prefix --: subtracts 1.0 (0x3C00) from both halves
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator++(__half2 &h, const int ignored)
+{
+    // Postfix increment: add 1.0 to both halves of h and hand back the pair it held beforehand.
+    // The int parameter only selects the postfix overload; it is deliberately unused.
+    static_cast<void>(ignored);
+    __half2_raw unit;
+    unit.x = 0x3C00U; // bit pattern of 1.0 in half precision
+    unit.y = 0x3C00U;
+    const __half2 previous = h;
+    h = __hadd2(h, unit);
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2  operator--(__half2 &h, const int ignored)
+{
+    // Postfix decrement: subtract 1.0 from both halves of h and hand back the pair it held beforehand.
+    // The int parameter only selects the postfix overload; it is deliberately unused.
+    static_cast<void>(ignored);
+    __half2_raw unit;
+    unit.x = 0x3C00U; // bit pattern of 1.0 in half precision
+    unit.y = 0x3C00U;
+    const __half2 previous = h;
+    h = __hsub2(h, unit);
+    return previous;
+}
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h) { return h; } // unary plus: returns a copy
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h) { return __hneg2(h); } // unary minus: forwards to __hneg2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } // comparisons return one bool for the pair via the __hb*2 helpers (presumably true only when both halves satisfy the relation - confirm in the CUDA Math API)
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } // forwards to __hbneu2 (the "u" variant), not to a negated __hbeq2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } // forwards to __hbgt2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } // forwards to __hblt2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } // forwards to __hbge2
+__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } // forwards to __hble2
+#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{ // Host helper: returns f truncated to half-precision bits; writes the half sign bit to `sign` and the discarded precision to `remainder` so each caller can apply its own rounding rule.
+    unsigned int x; // raw bits of f
+    unsigned int u; // bits of |f| (sign cleared)
+    unsigned int result;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)::std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U); // float sign (bit 31) moved to the half sign position (bit 15)
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+        remainder = 0U; // nothing left to round
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU); // Inf keeps its sign; every NaN becomes 0x7fff
+    } else if (u > 0x477fefffU) { // Overflows: truncate to the largest finite half (0x7bff)
+        remainder = 0x80000000U; // tie marker: the round-to-nearest callers see an odd result and step up to Inf (0x7c00)
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U; // the 13 mantissa bits about to be dropped, left-aligned in 32 bits
+        u -= 0x38000000U; // rebias the exponent: (127 - 15) << 23
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0: magnitudes up to 0x33000000 truncate to a signed zero
+        remainder = u; // stays nonzero for nonzero inputs, so the directed-rounding callers can still step away from zero
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent; // right shift that aligns the mantissa to the half denormal scale
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U; // make the implicit leading 1 explicit
+        remainder = mantissa << (32U - shift); // bits shifted out below, left-aligned in 32 bits
+        result = (sign | (mantissa >> shift));
+        result &= 0x0000FFFFU;
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif  /* #if !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
+{ // double -> half; the two host variants pre-round to 11 bits in double so the final float -> half step cannot round a second time
+IF_DEVICE_OR_CUDACC( // argument 1: device path, a single cvt.rn.f16.f64
+    __half val;
+    asm("{  cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
+    return val;
+, // argument 2: host path used under __CUDACC__ (global-namespace memcpy)
+    __half result;
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {   // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            shifterBits  = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {   // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing an intermediate memcpy and a harmless bitwise operation
+        unsigned long long int aShiftRoundBits;
+        (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+, // argument 3: host path used without __CUDACC__ (same algorithm, ::std::memcpy)
+    __half result;
+    /*
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    */
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)::std::memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        /*
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        */
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        /*
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        */
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {
+            /*
+            // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            */
+            shifterBits  = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {
+            /*
+            // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            */
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)::std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        /*
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing an intermediate memcpy and a harmless bitwise operation
+        */
+        unsigned long long int aShiftRoundBits;
+        (void)::std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)::std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{ // float -> half with round-to-nearest-even; the body is identical to __float2half_rn
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+, // otherwise: truncate, then round in software
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // discarded part above one half, or an exact tie on an odd result
+        r.x++; // step to the next representable magnitude
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{ // float -> half with round-to-nearest-even
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+, // otherwise: truncate, then round in software
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { // discarded part above one half, or an exact tie on an odd result
+        r.x++; // step to the next representable magnitude
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{ // float -> half with round-toward-zero
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+, // otherwise: the truncated result from the helper is used as is
+    __half_raw r;
+    unsigned int sign = 0U; // outputs of the helper; not needed for truncation
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{ // float -> half with rounding down (cvt.rm on the device path)
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+, // otherwise: truncate, then adjust negative inexact results
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) { // negative and inexact: truncation moved toward zero, so grow the magnitude by one step
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{ // float -> half with rounding up (cvt.rp on the device path)
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+, // otherwise: truncate, then adjust positive inexact results
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) { // positive and inexact: truncation moved toward zero, so grow the magnitude by one step
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{ // converts a once with round-to-nearest and places the same half in both lanes of the result
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+, // otherwise: build the pair from two __float2half_rn results
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+)
+    return val;
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) { // device helper: packs round-to-nearest conversions of a and b into one __half2
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n" // sources are passed as (b, a); presumably the instruction puts its first source in the high half - confirm against the PTX ISA
+        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+, // otherwise: convert each value separately, then pack with a as low and b as high
+    asm("{.reg .f16 low,high;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  cvt.rn.f16.f32 high, %2;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+)
+    return val;
+}
+
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{ // converts a and b with round-to-nearest and returns them as one __half2 (a first, b second)
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    val = __internal_device_float2_to_half2_rn(a,b); // helper is only defined under __CUDACC__ or _NVHPC_CUDA
+, // otherwise: build the pair from two __float2half_rn results
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+)
+    return val;
+}
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline float __internal_half2float(const unsigned short h)
+{ // Host helper: expands the half-precision bit pattern h into the float with the same value (every NaN becomes one positive NaN pattern)
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U); // bit 15
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU); // 5 exponent bits
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U); // 10 mantissa bits, moved to the top of the 23-bit float mantissa field
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign); // sign is 0 or 1, so the shift always yields 0
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); // NaN: all mantissa bits set; Inf: zero
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do { // shift left until the leading 1 has moved into bit 23, lowering the exponent once per step
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U; // rebias: 127 - 15
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)::std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif  /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{ // half -> float
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+, // otherwise: decode the raw bits in software
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{ // converts the low half of a (the .x field of its raw form) to float
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+, // otherwise: decode the raw .x bits in software
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{ // converts the high half of a (the .y field of its raw form) to float
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+, // otherwise: decode the raw .y bits in software
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+)
+    return val;
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h)
+{ // half -> signed char, rounding toward zero; the host path maps NaN to 0 and clamps out-of-range values
+    signed char i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    unsigned int tmp; // the asm result lands in a 32-bit register operand
+    asm("cvt.rzi.s8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h)));
+    const unsigned char u = static_cast<unsigned char>(tmp); // keep the low 8 bits
+    i = static_cast<signed char>(u);
+, // otherwise: convert through float with explicit saturation
+    const float f = __half2float(h);
+    const signed char max_val = (signed char)0x7fU;
+    const signed char min_val = (signed char)0x80U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); // shift the sign bit out; 0xF800 is then the Inf pattern
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<signed char>(f);
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h)
+{   // Convert half to unsigned char, rounding toward zero; the host path returns 0 for NaN and clamps to [0, 0xff].
+    unsigned char i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    unsigned int tmp;  // the asm result is received in a 32-bit ("r") register
+    asm("cvt.rzi.u8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h)));
+    i = static_cast<unsigned char>(tmp);  // narrow to 8 bits
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const unsigned char max_val = 0xffU;
+    const unsigned char min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned char>(f);
+    }
+)
+    return i;
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+{   // Convert half to short, rounding toward zero; the host path returns 0 for NaN and clamps to [min_val, max_val].
+    short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt, 16-bit registers on both sides
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<short int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+{   // Convert half to unsigned short, rounding toward zero; the host path returns 0 for NaN and clamps to [0, 0xffff].
+    unsigned short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt, 16-bit registers on both sides
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned short int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+{   // Convert half to int, rounding toward zero; the host path returns 0 for NaN and clamps to [min_val, max_val].
+    int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+{   // Convert half to unsigned int, rounding toward zero; the host path returns 0 for NaN and clamps to [0, 0xffffffff].
+    unsigned int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+{   // Convert half to long long, rounding toward zero; unlike the narrower variants, the host path maps NaN to min_val.
+    long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt into a 64-bit ("l") register
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN: result is min_val (bit pattern 0x8000000000000000)
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<long long int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+{   // Convert half to unsigned long long, rounding toward zero; the host path maps NaN to 0x8000000000000000, not 0.
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));  // device: one PTX cvt into a 64-bit ("l") register
+,
+    const float f = __half2float(h);  // host: range checks are done in float
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);  // raw bits with the sign bit shifted out
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {  // 0xF800 is the infinity pattern 0x7C00 shifted left by one
+        // NaN: result has only the top bit set, the same pattern __half2ll_rz returns for NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned long long int>(f);
+    }
+)
+    return i;
+}
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y)
+{   // Build a __half2 whose .x member is x and whose .y member is y.
+    __half2 t; t.x = x; t.y = y; return t;
+}
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
+{
+    // Forward both float2 components to __floats2half2_rn and hand back its result.
+    return __floats2half2_rn(a.x, a.y);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
+{   // Convert both elements of a half2 to a float2: result .x comes from the low half, .y from the high half.
+    float hi_float;
+    float lo_float;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));  // device: low half -> lo_float
+
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));  // device: high half -> hi_float
+,
+    lo_float = __internal_half2float(static_cast<__half2_raw>(a).x);  // host: named cast, same form as __low2float
+    hi_float = __internal_half2float(static_cast<__half2_raw>(a).y);  // host: named cast, same form as __high2float
+)
+    return make_float2(lo_float, hi_float);
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
+{   // Convert half to int, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
+{   // Convert half to int, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
+{   // Convert half to int, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
+{   // Convert int to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i)
+{   // Convert int to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i)
+{   // Convert int to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i)
+{   // Convert int to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
+{   // Convert half to short, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    short int i;
+    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
+{   // Convert half to short, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    short int i;
+    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
+{   // Convert half to short, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    short int i;
+    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
+{   // Convert short to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i)
+{   // Convert short to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i)
+{   // Convert short to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i)
+{   // Convert short to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
+{   // Convert half to unsigned int, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    unsigned int i;
+    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
+{   // Convert half to unsigned int, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    unsigned int i;
+    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
+{   // Convert half to unsigned int, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    unsigned int i;
+    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
+{   // Convert unsigned int to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
+{   // Convert unsigned int to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
+{   // Convert unsigned int to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
+{   // Convert unsigned int to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
+{   // Convert half to unsigned short, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    unsigned short int i;
+    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
+{   // Convert half to unsigned short, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    unsigned short int i;
+    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
+{   // Convert half to unsigned short, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    unsigned short int i;
+    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
+{   // Convert unsigned short to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
+{   // Convert unsigned short to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
+{   // Convert unsigned short to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
+{   // Convert unsigned short to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
+{   // Convert half to unsigned long long, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
+{   // Convert half to unsigned long long, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
+{   // Convert half to unsigned long long, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
+{   // Convert unsigned long long to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
+{   // Convert unsigned long long to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
+{   // Convert unsigned long long to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
+{   // Convert unsigned long long to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
+{   // Convert half to long long, rounding to nearest (PTX .rni); asm only, no host fallback here.
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
+{   // Convert half to long long, rounding down toward negative infinity (PTX .rmi); asm only, no host fallback here.
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
+{   // Convert half to long long, rounding up toward positive infinity (PTX .rpi); asm only, no host fallback here.
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
+{   // Convert long long to half, rounding to nearest.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rn(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i)
+{   // Convert long long to half, rounding toward zero.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rz(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i)
+{   // Convert long long to half, rounding down toward negative infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_rd(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i)
+{   // Convert long long to half, rounding up toward positive infinity.
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));  // device: one PTX cvt
+,
+    const float  f = static_cast<float>(i);  // host: go through float first
+                 h = __float2half_ru(f);     // then narrow with the same rounding mode
+)
+    return h;
+}
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{   // Round a half to an integral value toward zero, staying in half (PTX cvt.rzi.f16.f16).
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{   // Round a half to an integral value toward positive infinity, staying in half (PTX cvt.rpi.f16.f16).
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{   // Round a half to an integral value toward negative infinity, staying in half (PTX cvt.rmi.f16.f16).
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{   // Round a half to the nearest integral value, staying in half (PTX cvt.rni.f16.f16).
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
+{   // Apply the htrunc rounding (cvt.rzi.f16.f16) to both elements of a half2.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"  // unpack the two halves
+        "  cvt.rzi.f16.f16 low, low;\n"
+        "  cvt.rzi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));  // repack in the same order
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
+{   // Apply the hceil rounding (cvt.rpi.f16.f16) to both elements of a half2.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"  // unpack the two halves
+        "  cvt.rpi.f16.f16 low, low;\n"
+        "  cvt.rpi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));  // repack in the same order
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
+{   // Apply the hfloor rounding (cvt.rmi.f16.f16) to both elements of a half2.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"  // unpack the two halves
+        "  cvt.rmi.f16.f16 low, low;\n"
+        "  cvt.rmi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));  // repack in the same order
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
+{   // Apply the hrint rounding (cvt.rni.f16.f16) to both elements of a half2.
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"  // unpack the two halves
+        "  cvt.rni.f16.f16 low, low;\n"
+        "  cvt.rni.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));  // repack in the same order
+    return val;
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
+{   // Build a half2 from the low elements of a and b: result is {a.x, b.x}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));  // device: unpack both inputs, repack the two lows
+,
+    val.x = a.x;  // host: plain member copies
+    val.y = b.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
+{   // Build a half2 from the high elements of a and b: result is {a.y, b.y}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));  // device: unpack both inputs, repack the two highs
+,
+    val.x = a.y;  // host: plain member copies
+    val.y = b.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a)
+{   // Return the low element (.x) of a half2.
+    __half ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));  // device: unpack, keep "low"
+,
+    ret = a.x;  // host: plain member read
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a)
+{
+    // Classify the argument by its raw 16-bit pattern:
+    // -1 for 0xFC00, 1 for 0x7C00, and 0 for every other pattern.
+    const __half_raw bits = __half_raw(a);
+    if (bits.x == 0xFC00U) {
+        return -1;
+    }
+    if (bits.x == 0x7C00U) {
+        return 1;
+    }
+    return 0;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a)
+{   // Duplicate the low element of a into both slots: result is {a.x, a.x}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));  // device: unpack, repack "low" twice
+,
+    val.x = a.x;  // host: plain member copies
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a)
+{   // Duplicate the high element of a into both slots: result is {a.y, a.y}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));  // device: unpack, repack "high" twice
+,
+    val.x = a.y;  // host: plain member copies
+    val.y = a.y;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a)
+{   // Return the high element (.y) of a half2.
+    __half ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));  // device: unpack, keep "high"
+,
+    ret = a.y;  // host: plain member read
+)
+    return ret;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
+{   // Pack two halves into a half2: a becomes .x (low), b becomes .y (high).
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));  // device: one packing mov of two 16-bit registers
+,
+    val.x = a;  // host: plain member writes
+    val.y = b;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a)
+{   // Broadcast one half into both elements of a half2: result is {a, a}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));  // device: the same 16-bit register is packed twice
+,
+    val.x = a;  // host: plain member writes
+    val.y = a;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
+{   // Swap the two elements of a half2: result is {a.y, a.x}.
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));  // device: unpack, repack in reverse order
+,
+    val.x = a.y;  // host: plain member copies
+    val.y = a.x;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h)
+{   // Return the 16-bit storage of h cast to short int; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return static_cast<short int>(__HALF_TO_CUS(h));  // device: via the __HALF_TO_CUS accessor macro
+,
+    return static_cast<short int>(__half_raw(h).x);  // host: via the raw .x member
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
+{   // Return the 16-bit storage of h as unsigned short; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __HALF_TO_CUS(h);  // device: via the __HALF_TO_CUS accessor macro
+,
+    return __half_raw(h).x;  // host: via the raw .x member
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i)
+{   // Build a half whose 16-bit storage is i cast to unsigned short; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half h;
+    __HALF_TO_US(h) = static_cast<unsigned short int>(i);  // device: write the storage through the __HALF_TO_US accessor macro
+    return h;
+,
+    __half_raw hr;
+    hr.x = static_cast<unsigned short int>(i);  // host: fill the raw struct, then construct __half from it
+    return __half(hr);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
+{   // Build a half whose 16-bit storage is i; no numeric conversion is performed.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half h;
+    __HALF_TO_US(h) = i;  // device: write the storage through the __HALF_TO_US accessor macro
+    return h;
+,
+    __half_raw hr;
+    hr.x = i;  // host: fill the raw struct, then construct __half from it
+    return __half(hr);)
+}
+
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __internal_device_hmax(const __half a, const __half b)
+{   // Device helper behind __hmax: one branch for targets providing SM 8.0, a float-based branch otherwise.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(max)  // NOTE(review): macro is defined outside this chunk; presumably emits the half-precision max and the return - confirm
+,
+    const float fa = __half2float(a);  // fallback: widen both inputs to float
+    const float fb = __half2float(b);
+    float fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));  // take the maximum in float
+    const __half hr = __float2half(fr);  // convert the result back to half
+    return hr;
+)
+}
+__CUDA_FP16_DECL__ __half __internal_device_hmin(const __half a, const __half b)
+{   // Device helper behind __hmin: one branch for targets providing SM 8.0, a float-based branch otherwise.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(min)  // NOTE(review): macro is defined outside this chunk; presumably emits the half-precision min and the return - confirm
+,
+    const float fa = __half2float(a);  // fallback: widen both inputs to float
+    const float fb = __half2float(b);
+    float fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));  // take the minimum in float
+    const __half hr = __float2half(fr);  // convert the result back to half
+    return hr;
+)
+}
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b)
+{   // Maximum of two halves; the host path handles NaN and signed zero explicitly.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hmax(a, b);  // device: delegate to the device helper
+,
+    __half maxval;
+
+    maxval = (__hge(a, b) || __hisnan(b)) ? a : b;  // pick a when a >= b or b is NaN, otherwise b; a lone NaN input is therefore not selected
+
+    if (__hisnan(maxval))
+    {
+        // if both inputs are NaN, return canonical NaN
+        maxval = CUDART_NAN_FP16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmax(+0.0, -0.0) = +0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __half_raw ra = __half_raw(a);
+        __half_raw rb = __half_raw(b);
+        maxval = (ra.x > rb.x) ? b : a;  // the larger raw pattern has the sign bit set, so return the other operand
+    }
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b)
+{   // Minimum of two halves; the host path handles NaN and signed zero explicitly.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __internal_device_hmin(a, b);  // device: delegate to the device helper
+,
+    __half minval;
+
+    minval = (__hle(a, b) || __hisnan(b)) ? a : b;  // pick a when a <= b or b is NaN, otherwise b; a lone NaN input is therefore not selected
+
+    if (__hisnan(minval))
+    {
+        // if both inputs are NaN, return canonical NaN
+        minval = CUDART_NAN_FP16;
+    }
+    else if (__heq(a, b))
+    {
+        // hmin(+0.0, -0.0) = -0.0
+        // unsigned compare 0x8000U > 0x0000U
+        __half_raw ra = __half_raw(a);
+        __half_raw rb = __half_raw(b);
+        minval = (ra.x > rb.x) ? a : b;  // the larger raw pattern has the sign bit set, so return that operand
+    }
+
+    return minval;
+)
+}
+
+
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
+{   // Element-wise maximum of two half2 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(max)  // NOTE(review): macro is defined outside this chunk; presumably emits the packed max and the return - confirm
+,
+    __half2 val;
+    val.x = __hmax(a.x, b.x);  // fallback: scalar __hmax on each element
+    val.y = __hmax(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
+{   // Element-wise minimum of two half2 values.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(min)  // NOTE(review): macro is defined outside this chunk; presumably emits the packed min and the return - confirm
+,
+    __half2 val;
+    val.x = __hmin(a.x, b.x);  // fallback: scalar __hmin on each element
+    val.y = __hmin(a.y, b.y);
+    return val;
+)
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA)
+/******************************************************************************
+*                           __half, __half2 warp shuffle                     *
+******************************************************************************/
+#define __SHUFFLE_HALF2_MACRO(name) /* do */ { /* shared body of the non-sync half2 shuffles; reads var, delta and c from the expanding function */ \
+   __half2 r; \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" /* name is the PTX shuffle mnemonic, pasted in as a string */ \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
+   return r; /* the macro also supplies the function's return statement */ \
+} /* while(0) */
+
+#define __SHUFFLE_SYNC_HALF2_MACRO(name, var, delta, c, mask) /* do */ { /* shared body of the *_sync half2 shuffles; all operands are explicit macro parameters */ \
+   __half2 r; \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" /* name is the PTX shuffle mnemonic, pasted in as a string */ \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; /* the macro also supplies the function's return statement */ \
+} /* while(0) */
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
+{   // Non-sync indexed shuffle (shfl.idx) of a half2, moved as one 32-bit value.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));  // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;  // third shuffle operand: (warp_size - width) shifted to bit 8, with 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.idx.b32)  // expands to the asm plus the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
+{   // Non-sync upward shuffle (shfl.up) of a half2, moved as one 32-bit value.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));  // read the PTX WARP_SZ constant
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;  // third shuffle operand: unlike the other variants, the low bits stay 0
+    __SHUFFLE_HALF2_MACRO(shfl.up.b32)  // expands to the asm plus the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
+{   // Non-sync downward shuffle (shfl.down) of a half2, moved as one 32-bit value.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));  // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;  // third shuffle operand: (warp_size - width) shifted to bit 8, with 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.down.b32)  // expands to the asm plus the return; uses var, delta and c
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
+{   // Non-sync butterfly shuffle (shfl.bfly) of a half2, moved as one 32-bit value.
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));  // read the PTX WARP_SZ constant
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;  // third shuffle operand: (warp_size - width) shifted to bit 8, with 0x1f in the low bits
+    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)  // expands to the asm plus the return; uses var, delta and c
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width)
+{ /* Moves the __half2 as one 32-bit word with shfl.sync.idx.b32; mask is passed as the instruction's last operand. */
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); /* read the PTX WARP_SZ constant */
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; /* NOTE(review): presumably the packed segment-mask/clamp operand of shfl - confirm against the PTX ISA */
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) /* emits the asm and returns the shuffled value */
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width)
+{ /* Moves the __half2 as one 32-bit word with shfl.sync.up.b32; mask is passed as the instruction's last operand. */
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); /* read the PTX WARP_SZ constant */
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; /* unlike the idx/down/bfly variants, no 0x1f is OR-ed into the low bits */
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32, var, delta, c, mask) /* emits the asm and returns the shuffled value */
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width)
+{ /* Moves the __half2 as one 32-bit word with shfl.sync.down.b32; mask is passed as the instruction's last operand. */
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); /* read the PTX WARP_SZ constant */
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; /* NOTE(review): presumably the packed segment-mask/clamp operand of shfl - confirm against the PTX ISA */
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32, var, delta, c, mask) /* emits the asm and returns the shuffled value */
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width)
+{ /* Moves the __half2 as one 32-bit word with shfl.sync.bfly.b32; mask is passed as the instruction's last operand. */
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); /* read the PTX WARP_SZ constant */
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; /* NOTE(review): presumably the packed segment-mask/clamp operand of shfl - confirm against the PTX ISA */
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) /* emits the asm and returns the shuffled value */
+}
+
+#undef __SHUFFLE_HALF2_MACRO
+#undef __SHUFFLE_SYNC_HALF2_MACRO
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl(__halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_up(__halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_down(__halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_xor(__halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_sync(mask, __halves2half2(var, var), srcLane, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_up_sync(mask, __halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_down_sync(mask, __halves2half2(var, var), delta, width);
+    return __low2half(shuffled);
+}
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width)
+{
+    /* Put var in both lanes of a __half2, reuse the __half2 overload, then keep the low lane. */
+    const __half2 shuffled = __shfl_xor_sync(mask, __halves2half2(var, var), laneMask, width);
+    return __low2half(shuffled);
+}
+
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*               __half and __half2 __ldg,__ldcg,__ldca,__ldcs                *
+******************************************************************************/
+
+#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA))
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l" /* inline-asm constraint used for pointer operands when any 64-bit condition above holds */
+#else
+#define __LDG_PTR   "r" /* otherwise pointer operands use the "r" constraint */
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded as one 32-bit word with PTX ld.global.nc.b32. */
+    __half2 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded as one 16-bit word with PTX ld.global.nc.b16. */
+    __half ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded as one 32-bit word with PTX ld.global.cg.b32. */
+    __half2 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded as one 16-bit word with PTX ld.global.cg.b16. */
+    __half ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded as one 32-bit word with PTX ld.global.ca.b32. */
+    __half2 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded as one 16-bit word with PTX ld.global.ca.b16. */
+    __half ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded as one 32-bit word with PTX ld.global.cs.b32. */
+    __half2 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded as one 16-bit word with PTX ld.global.cs.b16. */
+    __half ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded with PTX ld.global.lu.b32; the asm declares a "memory" clobber. */
+    __half2 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded with PTX ld.global.lu.b16; the asm declares a "memory" clobber. */
+    __half ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr)
+{ /* Returns the __half2 at ptr, loaded with PTX ld.global.cv.b32; the asm declares a "memory" clobber. */
+    __half2 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
+{ /* Returns the __half at ptr, loaded with PTX ld.global.cv.b16; the asm declares a "memory" clobber. */
+    __half ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
+{ /* Stores value to ptr as one 32-bit word with PTX st.global.wb.b32; the asm declares a "memory" clobber. */
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
+{ /* Stores value to ptr as one 16-bit word with PTX st.global.wb.b16; the asm declares a "memory" clobber. */
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
+{ /* Stores value to ptr as one 32-bit word with PTX st.global.cg.b32; the asm declares a "memory" clobber. */
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
+{ /* Stores value to ptr as one 16-bit word with PTX st.global.cg.b16; the asm declares a "memory" clobber. */
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
+{ /* Stores value to ptr as one 32-bit word with PTX st.global.cs.b32; the asm declares a "memory" clobber. */
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
+{ /* Stores value to ptr as one 16-bit word with PTX st.global.cs.b16; the asm declares a "memory" clobber. */
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
+{ /* Stores value to ptr as one 32-bit word with PTX st.global.wt.b32; the asm declares a "memory" clobber. */
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
+{ /* Stores value to ptr as one 16-bit word with PTX st.global.wt.b16; the asm declares a "memory" clobber. */
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+#undef __LDG_PTR
+#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+/******************************************************************************
+*                             __half2 comparison                             *
+******************************************************************************/
+#define __COMPARISON_OP_HALF2_MACRO(name) /* body of a __half2 comparison returning __half2; a and b come from the expanding function */ {\
+   __half2 val; \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* expands to a brace block ending in "return val;" */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
+{ /* Lane-wise a == b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __heq is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.eq) /* native branch: set.eq.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __heq(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __heq(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
+{ /* Lane-wise a != b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __hne is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ne) /* native branch: set.ne.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hne(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hne(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
+{ /* Lane-wise a <= b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __hle is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.le) /* native branch: set.le.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hle(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hle(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
+{ /* Lane-wise a >= b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __hge is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ge) /* native branch: set.ge.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hge(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hge(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
+{ /* Lane-wise a < b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __hlt is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.lt) /* native branch: set.lt.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hlt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hlt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
+{ /* Lane-wise a > b. Fallback stores 0x3C00 (binary16 1.0) in a lane when __hgt is true for it, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.gt) /* native branch: set.gt.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hgt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a == b (see __hequ). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.equ) /* native branch: set.equ.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hequ(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hequ(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a != b (see __hneu). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.neu) /* native branch: set.neu.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hneu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hneu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a <= b (see __hleu). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.leu) /* native branch: set.leu.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hleu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hleu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a >= b (see __hgeu). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.geu) /* native branch: set.geu.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a < b (see __hltu). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.ltu) /* native branch: set.ltu.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hltu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hltu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a > b (see __hgtu). Fallback stores 0x3C00 (binary16 1.0) for a true lane, 0 otherwise. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO(set.gtu) /* native branch: set.gtu.f16x2.f16x2 */
+,
+    __half2_raw val;
+    val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    return __half2(val);
+)
+}
+#undef __COMPARISON_OP_HALF2_MACRO
+/******************************************************************************
+*                 __half2 comparison with mask output                        *
+******************************************************************************/
+#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* body of a __half2 comparison returning a 32-bit mask; a and b come from the expanding function */ {\
+   unsigned val; \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* expands to a brace block ending in "return val;" */
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a == b as a bit mask. Fallback: 0xFFFF per lane where __heq is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) /* native branch: set.eq.u32.f16x2 */
+,
+    const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a != b as a bit mask. Fallback: 0xFFFF per lane where __hne is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) /* native branch: set.ne.u32.f16x2 */
+,
+    const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hne(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a <= b as a bit mask. Fallback: 0xFFFF per lane where __hle is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.le) /* native branch: set.le.u32.f16x2 */
+,
+    const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a >= b as a bit mask. Fallback: 0xFFFF per lane where __hge is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) /* native branch: set.ge.u32.f16x2 */
+,
+    const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a < b as a bit mask. Fallback: 0xFFFF per lane where __hlt is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) /* native branch: set.lt.u32.f16x2 */
+,
+    const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise a > b as a bit mask. Fallback: 0xFFFF per lane where __hgt is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) /* native branch: set.gt.u32.f16x2 */
+,
+    const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a == b as a bit mask. Fallback: 0xFFFF per lane where __hequ is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) /* native branch: set.equ.u32.f16x2 */
+,
+    const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a != b as a bit mask. Fallback: 0xFFFF per lane where __hneu is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) /* native branch: set.neu.u32.f16x2 */
+,
+    const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a <= b as a bit mask. Fallback: 0xFFFF per lane where __hleu is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) /* native branch: set.leu.u32.f16x2 */
+,
+    const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a >= b as a bit mask. Fallback: 0xFFFF per lane where __hgeu is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) /* native branch: set.geu.u32.f16x2 */
+,
+    const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgeu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a < b as a bit mask. Fallback: 0xFFFF per lane where __hltu is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) /* native branch: set.ltu.u32.f16x2 */
+,
+    const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b)
+{ /* Lane-wise unordered a > b as a bit mask. Fallback: 0xFFFF per lane where __hgtu is true, .y result in bits 31..16, .x in bits 15..0. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) /* native branch: set.gtu.u32.f16x2 */
+,
+    const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U;
+    unsigned ur = (unsigned)py;
+             ur <<= (unsigned)16U;
+             ur |= (unsigned)px;
+    return ur;
+)
+}
+#undef __COMPARISON_OP_HALF2_MACRO_MASK
+
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
+{
+    /* True only when __heq2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__heq2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
+{
+    /* True only when __hne2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hne2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
+{
+    /* True only when __hle2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hle2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
+{
+    /* True only when __hge2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hge2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
+{
+    /* True only when __hlt2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hlt2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
+{
+    /* True only when __hgt2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hgt2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
+{
+    /* True only when __hequ2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hequ2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
+{
+    /* True only when __hneu2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hneu2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
+{
+    /* True only when __hleu2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hleu2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
+{
+    /* True only when __hgeu2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hgeu2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
+{
+    /* True only when __hltu2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hltu2_mask(a, b) == 0xFFFFFFFFU);
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
+{
+    /* True only when __hgtu2_mask reports every bit set, i.e. the comparison held in both lanes. */
+    return (__hgtu2_mask(a, b) == 0xFFFFFFFFU);
+}
+/******************************************************************************
+*                             __half comparison                              *
+******************************************************************************/
+#define __COMPARISON_OP_HALF_MACRO(name) /* body of a scalar __half comparison returning bool; a and b come from the expanding function */ {\
+   unsigned short val; \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_FP16_STRINGIFY(name) ".f16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+} /* setp writes a predicate, selp turns it into 1 or 0, and the block returns that as bool */
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b)
+{ /* a == b. Fallback converts both operands with __half2float and compares the floats. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(eq) /* native branch: setp.eq.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa == fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b)
+{ /* a != b, false when either operand is NaN. Fallback compares as float and adds explicit __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ne) /* native branch: setp.ne.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); /* float != alone would be true for NaN operands */
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b)
+{ /* a <= b. Fallback converts both operands with __half2float and compares the floats. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(le) /* native branch: setp.le.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa <= fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b)
+{ /* a >= b. Fallback converts both operands with __half2float and compares the floats. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ge) /* native branch: setp.ge.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa >= fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b)
+{ /* a < b. Fallback converts both operands with __half2float and compares the floats. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(lt) /* native branch: setp.lt.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa < fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b)
+{ /* a > b. Fallback converts both operands with __half2float and compares the floats. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(gt) /* native branch: setp.gt.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa > fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b)
+{ /* Unordered a == b: also true when either operand is NaN. Fallback compares as float and ORs in __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(equ) /* native branch: setp.equ.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa == fb) || (__hisnan(a)) || (__hisnan(b));
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b)
+{ /* Unordered a != b. Fallback is a plain float != on the __half2float-converted operands. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(neu) /* native branch: setp.neu.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa != fb); /* float != is already true when an operand is NaN, so no __hisnan check is needed */
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b)
+{ /* Unordered a <= b: also true when either operand is NaN. Fallback compares as float and ORs in __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(leu) /* native branch: setp.leu.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa <= fb) || (__hisnan(a)) || (__hisnan(b));
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b)
+{ /* Unordered a >= b: also true when either operand is NaN. Fallback compares as float and ORs in __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(geu) /* native branch: setp.geu.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa >= fb) || (__hisnan(a)) || (__hisnan(b));
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b)
+{ /* Unordered a < b: also true when either operand is NaN. Fallback compares as float and ORs in __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(ltu) /* native branch: setp.ltu.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa < fb) || (__hisnan(a)) || (__hisnan(b));
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b)
+{ /* Unordered a > b: also true when either operand is NaN. Fallback compares as float and ORs in __hisnan checks. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __COMPARISON_OP_HALF_MACRO(gtu) /* native branch: setp.gtu.f16 */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return (fa > fb) || (__hisnan(a)) || (__hisnan(b));
+)
+}
+#undef __COMPARISON_OP_HALF_MACRO
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
+{ /* Lane-wise a + b. Fallback applies the scalar __hadd to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hadd(a.x, b.x);
+    val.y = __hadd(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
+{ /* Lane-wise a - b. Fallback applies the scalar __hsub to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hsub(a.x, b.x);
+    val.y = __hsub(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
+{ /* Lane-wise a * b. Fallback applies the scalar __hmul to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hmul(a.x, b.x);
+    val.y = __hmul(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
+{ /* Lane-wise saturating a + b. Fallback applies the scalar __hadd_sat to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hadd_sat(a.x, b.x);
+    val.y = __hadd_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
+{ /* Lane-wise saturating a - b. Fallback applies the scalar __hsub_sat to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hsub_sat(a.x, b.x);
+    val.y = __hsub_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
+{ /* Lane-wise saturating a * b. Fallback applies the scalar __hmul_sat to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hmul_sat(a.x, b.x);
+    val.y = __hmul_sat(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
+{ /* Lane-wise a + b using the add.rn form natively. Fallback applies the scalar __hadd_rn to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(add.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hadd_rn(a.x, b.x);
+    val.y = __hadd_rn(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
+{ /* Lane-wise a - b using the sub.rn form natively. Fallback applies the scalar __hsub_rn to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(sub.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hsub_rn(a.x, b.x);
+    val.y = __hsub_rn(a.y, b.y);
+    return val;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
+{ /* Lane-wise a * b using the mul.rn form natively. Fallback applies the scalar __hmul_rn to the .x and .y lanes separately. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF2_MACRO(mul.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    __half2 val;
+    val.x = __hmul_rn(a.x, b.x);
+    val.y = __hmul_rn(a.y, b.y);
+    return val;
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
+{ /* Device-only; the whole body is __TERNARY_OP_HALF2_MACRO (defined elsewhere in this header) with fma.rn. Presumably lane-wise a * b + c - confirm against the macro. */
+    __TERNARY_OP_HALF2_MACRO(fma.rn)
+}
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
+{ /* Device-only; the whole body is __TERNARY_OP_HALF2_MACRO (defined elsewhere in this header) with fma.rn.sat. Presumably saturating lane-wise a * b + c - confirm against the macro. */
+    __TERNARY_OP_HALF2_MACRO(fma.rn.sat)
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
+    /*
+     * Lane-wise division built on the scalar __hdiv: the low halves of a and b
+     * are divided first, then the high halves, and the two quotients are
+     * recombined with __halves2half2 in (low, high) argument order.
+     */
+
+    const __half quot_lo = __hdiv(__low2half(a), __low2half(b));
+
+    const __half quot_hi = __hdiv(__high2half(a), __high2half(b));
+
+    return __halves2half2(quot_lo, quot_hi);
+}
+
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b)
+{ /* a + b. Fallback widens both operands with __half2float, adds in float and converts back with __float2half. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b)
+{ /* a - b. Fallback widens both operands with __half2float, subtracts in float and converts back with __float2half. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b)
+{ /* a * b. Fallback widens both operands with __half2float, multiplies in float and converts back with __float2half. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa * fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
+{ /* Saturating a + b. Fallback clamps __hadd(a, b) between CUDART_ZERO_FP16 and CUDART_ONE_FP16. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    return __hmin(__hmax(__hadd(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); /* lower bound first, then upper bound */
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
+{ /* Saturating a - b. Fallback clamps __hsub(a, b) between CUDART_ZERO_FP16 and CUDART_ONE_FP16. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    return __hmin(__hmax(__hsub(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); /* lower bound first, then upper bound */
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
+{ /* Saturating a * b. Fallback clamps __hmul(a, b) between CUDART_ZERO_FP16 and CUDART_ONE_FP16. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul.sat) /* native branch; macro is defined elsewhere in this header */
+,
+    return __hmin(__hmax(__hmul(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); /* lower bound first, then upper bound */
+)
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
+{ /* a + b using the add.rn form natively. The fallback is the same float computation as the __hadd fallback. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(add.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa + fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
+{ /* a - b using the sub.rn form natively. The fallback is the same float computation as the __hsub fallback. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(sub.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa - fb);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
+{ /* a * b using the mul.rn form natively. The fallback is the same float computation as the __hmul fallback. */
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __BINARY_OP_HALF_MACRO(mul.rn) /* native branch; macro is defined elsewhere in this header */
+,
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    return __float2half(fa * fb);
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
+{ // Fused multiply-add of the three __half operands; there is no float fallback branch in this definition.
+    __TERNARY_OP_HALF_MACRO(fma.rn) // macro is defined outside this hunk; presumably emits the "fma.rn" half instruction on a, b, c
+}
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
+{ // Saturating variant of the fused multiply-add; there is no float fallback branch in this definition.
+    __TERNARY_OP_HALF_MACRO(fma.rn.sat) // macro is defined outside this hunk; presumably emits the "fma.rn.sat" half instruction
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b)
+{ // Returns a / b as a __half; device code uses an approximate reciprocal, other targets a plain float divide.
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    __half v;
+    __half abs;
+    __half den;
+    __HALF_TO_US(den) = 0x008FU; // threshold given as raw half bits; quotients with magnitude below it are refined
+
+    float rcp;
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+
+    asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); // rcp = approximate 1 / fb
+
+    float fv = rcp * fa; // first estimate of the quotient in float
+
+    v = __float2half(fv);
+    abs = __habs(v);
+    if (__hlt(abs, den) && __hlt(__float2half(0.0f), abs))  { // only when 0 < |v| < den
+        const float err = __fmaf_rn(-fb, fv, fa); // residual: fa - fb * fv
+        fv = __fmaf_rn(rcp, err, fv); // one correction step: fv + rcp * err
+        v = __float2half(fv);
+    }
+    return v;
+,
+    const float fa = __half2float(a); // non-device branch: divide in float and narrow
+    const float fb = __half2float(b);
+    return __float2half(fa / fb);
+)
+}
+
+/******************************************************************************
+*                             __half2 functions                  *
+******************************************************************************/
+#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
+#define __APPROX_FCAST(fun) /* do */ { /* function body for a __half f(a): convert a to f32, apply <fun>.approx.ftz.f32, convert back with cvt.rn */ \
+   __half val;\
+   asm("{.reg.b32         f;        \n"\
+                " .reg.b16         r;        \n"\
+                "  mov.b16         r,%1;     \n"\
+                "  cvt.f32.f16     f,r;      \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   f,f;  \n"\
+                "  cvt.rn.f16.f32      r,f;  \n"\
+                "  mov.b16         %0,r;     \n"\
+                "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
+   return val;\
+} /* while(0) */
+#define __APPROX_FCAST2(fun) /* do */ { /* function body for a __half2 f(a): unpack both halves, apply <fun>.approx.ftz.f32 to each in f32, repack */ \
+   __half2 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n"\
+                "  cvt.f32.f16     fl, hl;         \n"\
+                "  cvt.f32.f16     fu, hu;         \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fl, fl;     \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fu, fu;     \n"\
+                "  cvt.rn.f16.f32      hl, fl;     \n"\
+                "  cvt.rn.f16.f32      hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n"\
+                "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)
+#define __SPEC_CASE2(i,r, spc, ulp) /* PTX fragment (packed f16x2): p = (i == spc) per half, then r = p * ulp + r; NOTE(review): relies on set.eq yielding 1.0/0.0 -- confirm in the PTX ISA */ \
+   "{.reg.b32 spc, ulp, p;\n"\
+   "  mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
+#define __SPEC_CASE(i,r, spc, ulp) /* PTX fragment (scalar f16): p = (i == spc), then r = p * ulp + r; NOTE(review): relies on set.eq yielding 1.0/0.0 -- confirm in the PTX ISA */ \
+   "{.reg.b16 spc, ulp, p;\n"\
+   "  mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
+static __device__ __forceinline__ float __float_simpl_sinf(float a);
+static __device__ __forceinline__ float __float_simpl_cosf(float a);
+__CUDA_FP16_DECL__ __half hsin(const __half a) { // Sine of a: evaluated in float, rounded to __half, then patched for two specific inputs.
+    const float sl = __float_simpl_sinf(__half2float(a));
+    __half r = __float2half_rn(sl);
+    asm("{\n\t"
+        "  .reg.b16 i,r,t;     \n\t"
+        "  mov.b16 r, %0;      \n\t"
+        "  mov.b16 i, %1;      \n\t"
+        "  and.b16 t, r, 0x8000U; \n\t" // t keeps only the sign bit of the rounded result
+        "  abs.f16 r, r;   \n\t" // the fix-ups below operate on magnitudes of both result and input
+        "  abs.f16 i, i;   \n\t"
+        __SPEC_CASE(i, r, 0X32B3U, 0x0800U) // when |a| has bit pattern 0x32B3, adjust r by the half with bits 0x0800
+        __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) // when |a| has bit pattern 0x5CB0, adjust r by the half with bits 0x9000
+        "  or.b16  r,r,t;      \n\t" // put the saved sign bit back
+        "  mov.b16 %0, r;      \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { // Sine of both halves of a; same scheme as the scalar hsin, on packed f16x2 data.
+    const float sl = __float_simpl_sinf(__half2float(a.x));
+    const float sh = __float_simpl_sinf(__half2float(a.y));
+    __half2 r = __floats2half2_rn(sl, sh);
+    asm("{\n\t"
+        "  .reg.b32 i,r,t;             \n\t"
+        "  mov.b32 r, %0;              \n\t"
+        "  mov.b32 i, %1;              \n\t"
+        "  and.b32 t, r, 0x80008000U;   \n\t" // t keeps the sign bit of each half of the result
+        "  abs.f16x2 r, r;   \n\t"
+        "  abs.f16x2 i, i;   \n\t"
+        __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) // per-half fix-up where |a| has bit pattern 0x32B3
+        __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) // per-half fix-up where |a| has bit pattern 0x5CB0
+        "  or.b32  r, r, t;            \n\t" // put the saved sign bits back
+        "  mov.b32 %0, r;              \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hcos(const __half a) { // Cosine of a: evaluated in float, rounded to __half, then patched for one specific input.
+    const float cl = __float_simpl_cosf(__half2float(a));
+    __half r = __float2half_rn(cl);
+    asm("{\n\t"
+        "  .reg.b16 i,r;        \n\t"
+        "  mov.b16 r, %0;       \n\t"
+        "  mov.b16 i, %1;       \n\t"
+        "  abs.f16 i, i;        \n\t" // only the input is made non-negative here; r keeps its sign
+        __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) // when |a| has bit pattern 0x2B7C, adjust r by the half with bits 0x1000
+        "  mov.b16 %0, r;       \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { // Cosine of both halves of a; same scheme as the scalar hcos, on packed f16x2 data.
+    const float cl = __float_simpl_cosf(__half2float(a.x));
+    const float ch = __float_simpl_cosf(__half2float(a.y));
+    __half2 r = __floats2half2_rn(cl, ch);
+    asm("{\n\t"
+        "  .reg.b32 i,r;   \n\t"
+        "  mov.b32 r, %0;  \n\t"
+        "  mov.b32 i, %1;  \n\t"
+        "  abs.f16x2 i, i; \n\t" // only the inputs are made non-negative here
+        __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) // per-half fix-up where |a| has bit pattern 0x2B7C
+        "  mov.b32 %0, r;  \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
+{ // Range reduction: returns a minus a multiple of ~pi/2 and stores the raw bits of the rounded multiple in *quadrant.
+    const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); // a * ~(2/pi) + 1.5 * 2^23
+    const unsigned q = __float_as_uint(ar); // bit pattern of ar; callers only test its low bits
+    const float j = __fsub_rn(ar, 12582912.0F); // subtracting the same constant back leaves the multiple as a float
+    float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); // a - j * (leading part of pi/2)
+    t = __fmaf_rn(j, -7.5497894158615964e-008F, t); // minus j * (small remainder of pi/2)
+    *quadrant = q;
+    return t;
+}
+static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
+{ // Evaluates a sine or cosine polynomial at x; bit 0 of i selects which one, bit 1 of i negates the result.
+    float z;
+    const float x2 = x*x;
+    float a8; // a8, a6, a4, a2: coefficients fed to the Horner steps below
+    float a6;
+    float a4;
+    float a2;
+    float a1; // multiplier used in the final step (x2 for cos, x for sin)
+    float a0; // addend used in the final step (1 for cos, x for sin)
+
+    if ((i & 1U) != 0U) {
+        // cos
+        a8 =  2.44331571e-5F;
+        a6 = -1.38873163e-3F;
+        a4 =  4.16666457e-2F;
+        a2 = -5.00000000e-1F;
+        a1 = x2;
+        a0 = 1.0F;
+    }
+    else {
+        // sin
+        a8 = -1.95152959e-4F;
+        a6 =  8.33216087e-3F;
+        a4 = -1.66666546e-1F;
+        a2 = 0.0F;
+        a1 = x;
+        a0 = x;
+    }
+
+    z = __fmaf_rn(a8, x2, a6); // Horner evaluation in x2 ...
+    z = __fmaf_rn(z, x2, a4);
+    z = __fmaf_rn(z, x2, a2);
+    z = __fmaf_rn(z, a1, a0); // ... finished with the branch-specific multiplier and addend
+
+    if ((i & 2U) != 0U) { // bit 1 set: flip the sign
+        z = -z;
+    }
+    return z;
+}
+static __device__ __forceinline__ float __float_simpl_sinf(float a)
+{ // Float sine helper: reduce the argument, then evaluate the sin/cos kernel with the reduction's quadrant word.
+    float z;
+    unsigned i; // filled in by the reduction step
+    a = __internal_trig_reduction_kernel(a, &i); // a now holds the reduced argument
+    z = __internal_sin_cos_kernel(a, i);
+    return z;
+}
+static __device__ __forceinline__ float __float_simpl_cosf(float a)
+{ // Float cosine helper: same reduction as the sine helper, but the kernel is called with the quadrant advanced by one.
+    float z;
+    unsigned i; // filled in by the reduction step
+    a = __internal_trig_reduction_kernel(a, &i); // a now holds the reduced argument
+    z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); // keep the two low bits, add one, so the kernel's branch and sign selection shifts by a quadrant
+    return z;
+}
+
+__CUDA_FP16_DECL__ __half hexp(const __half a) { // e^a: scale a by ~log2(e) in f32, apply ex2.approx, round to half, then patch four specific inputs.
+    __half val;
+    asm("{.reg.b32         f, C, nZ;       \n"
+        " .reg.b16         h,r;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n" // f32 bit pattern of ~1.442695, i.e. log2(e)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      f,f,C,nZ;       \n" // f = f * C + (-0.0)
+        "  ex2.approx.ftz.f32  f,f;        \n" // f = 2^f (approximate)
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X1F79U, 0x9400U) // fix-ups keyed on the exact bit pattern of the input h
+        __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
+        __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
+        __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { // e^x for both halves of a; same scheme as the scalar hexp, on packed data.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // unpack the two halves for the f32 computation
+        "  mov.b32         h, %1;          \n" // h keeps the packed input for the fix-up comparisons
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n" // f32 bit pattern of ~1.442695, i.e. log2(e)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the two rounded results
+        __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) // per-half fix-ups keyed on the exact input bit patterns
+        __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
+        __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
+        __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */
+
+__CUDA_FP16_DECL__ __half htanh(const __half a) { // Hyperbolic tangent of a, computed through float tanhf.
+    float f = __half2float(a);
+    f = tanhf(f);
+    __half h = __float2half_rn(f); // narrow the float result back to __half
+    return h;
+}
+__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a) { // Hyperbolic tangent of both halves of a, computed through float tanhf.
+    float2 f = __half22float2(a);
+    f.x = tanhf(f.x); // each component is handled independently
+    f.y = tanhf(f.y);
+    __half2 h = __float22half2_rn(f); // narrow both float results back to a __half2
+    return h;
+}
+
+__CUDA_FP16_DECL__ __half htanh_approx(const __half a) { // Approximate tanh: native PTX instruction when available, otherwise htanh.
+    __half r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    __half_raw hr = (__half_raw)a; // work on the raw 16-bit storage
+    asm("tanh.approx.f16 %0, %0;" : "+h"(hr.x)); // in-place: the same operand is both source and destination
+    r = (__half)hr;
+,
+    r = htanh(a); // fallback branch: float-based implementation
+)
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a) { // Approximate tanh of both halves: native PTX instruction when available, otherwise h2tanh.
+    __half2 res;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75,
+    asm("tanh.approx.f16x2 %0, %1;" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a))); // single packed instruction covers both halves
+,
+    res = h2tanh(a); // fallback branch: float-based implementation
+)
+    return res;
+}
+
+__CUDA_FP16_DECL__ __half hexp2(const __half a) { // 2^a: ex2.approx in f32, a small relative nudge, then rounding back to half.
+    __half val;
+    asm("{.reg.b32         f, ULP;         \n"
+        " .reg.b16         r;              \n"
+        "  mov.b16         r,%1;           \n"
+        "  cvt.f32.f16     f,r;            \n"
+        "  ex2.approx.ftz.f32      f,f;    \n" // f = 2^f (approximate)
+        "  mov.b32         ULP, 0x33800000U;\n" // f32 bit pattern of 2^-24
+        "  fma.rn.f32      f,f,ULP,f;      \n" // f = f * ULP + f, i.e. f scaled by (1 + 2^-24)
+        "  cvt.rn.f16.f32      r,f;        \n"
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { // 2^x for both halves of a; same scheme as the scalar hexp2.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, ULP;    \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // unpack the two halves
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  mov.b32         ULP, 0x33800000U;\n" // f32 bit pattern of 2^-24
+        "  fma.rn.f32      fl,fl,ULP,fl;   \n" // each lane scaled by (1 + 2^-24)
+        "  fma.rn.f32      fu,fu,ULP,fu;   \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         %0, {hl, hu};   \n" // repack into the output word
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half hexp10(const __half a) { // 10^a: scale a by ~log2(10) in f32, apply ex2.approx, round to half, then patch five specific inputs.
+    __half val;
+    asm("{.reg.b16         h,r;            \n"
+        " .reg.b32         f, C, nZ;       \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  mov.b32         C, 0x40549A78U; \n" // f32 bit pattern of ~3.321928, i.e. log2(10)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      f,f,C,nZ;       \n" // f = f * C + (-0.0)
+        "  ex2.approx.ftz.f32  f, f;       \n" // f = 2^f (approximate)
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x34DEU, 0x9800U) // fix-ups keyed on the exact bit pattern of the input h
+        __SPEC_CASE(h, r, 0x9766U, 0x9000U)
+        __SPEC_CASE(h, r, 0x9972U, 0x1000U)
+        __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
+        __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { // 10^x for both halves of a; same scheme as the scalar hexp10, on packed data.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // unpack the two halves for the f32 computation
+        "  mov.b32         h, %1;          \n" // h keeps the packed input for the fix-up comparisons
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x40549A78U; \n" // f32 bit pattern of ~3.321928, i.e. log2(10)
+        "  mov.b32         nZ, 0x80000000U;\n" // f32 bit pattern of -0.0
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the two rounded results
+        __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) // per-half fix-ups keyed on the exact input bit patterns
+        __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog2(const __half a) { // log2(a): lg2.approx in f32, rounded to half, then patched for two specific results.
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f;              \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n" // f = log2(f) (approximate)
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) // note: these fix-ups compare the rounded result r, not the input h
+        __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { // log2 of both halves of a; same scheme as the scalar hlog2, on packed data.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, r, p;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n" // unpack the two halves
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  lg2.approx.ftz.f32  fl, fl;     \n"
+        "  lg2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n" // repack the two rounded results
+        __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) // note: these fix-ups compare the rounded results r, not the inputs
+        __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
+        "  mov.b32         %0, r;          \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog(const __half a) { // Natural log of a: lg2.approx in f32 scaled by ~ln(2), rounded to half, then patched for four specific inputs.
+    __half val;
+    asm("{.reg.b32         f, C;           \n"
+        " .reg.b16         r,h;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  lg2.approx.ftz.f32  f,f;        \n" // f = log2(f) (approximate)
+        "  mov.b32         C, 0x3f317218U;  \n" // f32 bit pattern of ~0.693147, i.e. ln(2)
+        "  mul.f32         f,f,C;          \n" // log2(x) * ln(2) = ln(x)
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X160DU, 0x9C00U) // fix-ups keyed on the exact bit pattern of the input h
+        __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
+        __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
+        __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { // Natural log of both halves of a; same scheme as the scalar hlog, on packed data.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // unpack the two halves for the f32 computation
+        "  mov.b32         h, %1;              \n" // h keeps the packed input for the fix-up comparisons
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n" // f32 bit pattern of ~0.693147, i.e. ln(2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // repack the two rounded results
+        __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) // per-half fix-ups keyed on the exact input bit patterns
+        __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
+        __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
+        __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog10(const __half a) { // log10(a): lg2.approx in f32 scaled by ~log10(2), rounded to half, then patched for four specific inputs.
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n" // f = log2(f) (approximate)
+        "  mov.b32         C, 0x3E9A209BU; \n" // f32 bit pattern of ~0.30103, i.e. log10(2)
+        "  mul.f32         f,f,C;          \n" // log2(x) * log10(2) = log10(x)
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x338FU, 0x1000U) // fix-ups keyed on the exact bit pattern of the input h
+        __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
+        __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
+        __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { // log10 of both halves of a; same scheme as the scalar hlog10, on packed data.
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n" // unpack the two halves for the f32 computation
+        "  mov.b32         h, %1;              \n" // h keeps the packed input for the fix-up comparisons
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;     \n" // f32 bit pattern of ~0.30103, i.e. log10(2)
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n" // repack the two rounded results
+        __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) // per-half fix-ups keyed on the exact input bit patterns
+        __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
+        __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+#undef __SPEC_CASE2
+#undef __SPEC_CASE
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { // Approximate reciprocal of both halves of a.
+    __APPROX_FCAST2(rcp) // macro supplies the whole body: rcp.approx.ftz.f32 on each half in f32, then repack and return
+}
+__CUDA_FP16_DECL__ __half hrcp(const __half a) { // Approximate reciprocal of a.
+    __APPROX_FCAST(rcp) // macro supplies the whole body: rcp.approx.ftz.f32 in f32, then round back to half and return
+}
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { // Approximate reciprocal square root of both halves of a.
+    __APPROX_FCAST2(rsqrt) // macro supplies the whole body: rsqrt.approx.ftz.f32 on each half in f32, then repack and return
+}
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { // Approximate reciprocal square root of a.
+    __APPROX_FCAST(rsqrt) // macro supplies the whole body: rsqrt.approx.ftz.f32 in f32, then round back to half and return
+}
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { // Approximate square root of both halves of a.
+    __APPROX_FCAST2(sqrt) // macro supplies the whole body: sqrt.approx.ftz.f32 on each half in f32, then repack and return
+}
+__CUDA_FP16_DECL__ __half hsqrt(const __half a) { // Approximate square root of a.
+    __APPROX_FCAST(sqrt) // macro supplies the whole body: sqrt.approx.ftz.f32 in f32, then round back to half and return
+}
+#undef __APPROX_FCAST
+#undef __APPROX_FCAST2
+#endif /* defined(_NVHPC_CUDA) || defined(__CUDACC__) */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a)
+{ // Per-half NaN test; the fallback branch encodes the answer as half bits 0x3C00 (NaN) or 0 (not NaN).
+    __half2 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" // a is passed as both comparison operands
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
+,
+    __half2_raw val;
+    val.x = __hisnan(a.x) ? (unsigned short)0x3C00U : (unsigned short)0U; // 0x3C00 is the half bit pattern of 1.0
+    val.y = __hisnan(a.y) ? (unsigned short)0x3C00U : (unsigned short)0U;
+    r = __half2(val);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a)
+{ // Returns true when a is NaN.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __half r;
+    asm("{set.nan.f16.f16 %0,%1,%2;\n}" // a is passed as both comparison operands
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
+    return __HALF_TO_CUS(r) != 0U; // any non-zero result bits mean "is NaN"
+,
+    const __half_raw hr = static_cast<__half_raw>(a);
+    return ((hr.x & (unsigned short)0x7FFFU) > (unsigned short)0x7C00U); // sign bit masked off; magnitude bits above 0x7C00 are NaN encodings
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a)
+{ // Negates both halves of a.
+    __half2 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    asm("{neg.f16x2 %0,%1;\n}" // one packed instruction covers both halves
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+,
+    r.x = __hneg(a.x); // fallback branch: negate each half with the scalar routine
+    r.y = __hneg(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a)
+{ // Returns -a.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __half r;
+    asm("{neg.f16 %0,%1;\n}" // native half negate
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+,
+    const float fa = __half2float(a); // fallback branch: negate in float and narrow back
+    return __float2half(-fa);
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a)
+{ // Absolute value of both halves of a.
+    __half2 r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    asm("{abs.f16x2 %0,%1;\n}" // one packed instruction covers both halves
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+,
+    r.x = __habs(a.x); // fallback branch: apply the scalar routine to each half
+    r.y = __habs(a.y);
+)
+    return r;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a)
+{ // Returns |a|; the fallback branch maps every NaN input to the bit pattern 0x7FFF.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53,
+    __half r;
+    asm("{abs.f16 %0,%1;\n}" // native half absolute value
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+,
+    __half_raw abs_a_raw = static_cast<__half_raw>(a);
+    abs_a_raw.x &= (unsigned short)0x7FFFU; // clear the sign bit
+    if (abs_a_raw.x > (unsigned short)0x7C00U) // same NaN test as the fallback in __hisnan
+    {
+        // return canonical NaN
+        abs_a_raw.x = (unsigned short)0x7FFFU;
+    }
+    return static_cast<__half>(abs_a_raw);
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
+{ // Complex multiply-accumulate a * b + c, with .x used as the real part and .y as the imaginary part.
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __half real_tmp =  __hfma(a.x, b.x, c.x); // c.re + a.re*b.re
+    __half img_tmp  =  __hfma(a.x, b.y, c.y); // c.im + a.re*b.im
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); // subtraction done by negating a.im before the fma
+    img_tmp  = __hfma(a.y,         b.x, img_tmp);
+    return make_half2(real_tmp, img_tmp);
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
+{ // Maximum of a and b where a NaN operand makes the result NaN.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(max.NaN) // macro is defined outside this hunk; presumably emits the "max.NaN" half instruction
+,
+    __half maxval;
+    if (__hisnan(a) || __hisnan(b)) // fallback branch: test for NaN explicitly
+    {
+        maxval = CUDART_NAN_FP16;
+    }
+    else
+    {
+        maxval = __hmax(a, b); // neither operand is NaN here
+    }
+    return maxval;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
+{ // Minimum of a and b where a NaN operand makes the result NaN.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF_MACRO(min.NaN) // macro is defined outside this hunk; presumably emits the "min.NaN" half instruction
+,
+    __half minval;
+    if (__hisnan(a) || __hisnan(b)) // fallback branch: test for NaN explicitly
+    {
+        minval = CUDART_NAN_FP16;
+    }
+    else
+    {
+        minval = __hmin(a, b); // neither operand is NaN here
+    }
+    return minval;
+)
+}
+
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
+{ // Fused multiply-add followed by a ReLU-style lower bound at zero.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __TERNARY_OP_HALF_MACRO(fma.rn.relu) // macro is defined outside this hunk; presumably emits the "fma.rn.relu" half instruction
+,
+    return __hmax_nan(__hfma(a, b, c), CUDART_ZERO_FP16); // fallback branch: NaN-propagating max of the fma result with zero
+)
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
+{ // Per-half maximum where a NaN in either operand's half makes that half of the result NaN.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(max.NaN) // macro is defined outside this hunk; presumably emits the packed "max.NaN" instruction
+,
+    __half2 result = __hmax2(a, b); // fallback branch: plain packed max first, NaN lanes overwritten below
+    if (__hisnan(a.x) || __hisnan(b.x))
+    {
+        result.x = CUDART_NAN_FP16;
+    }
+    if (__hisnan(a.y) || __hisnan(b.y))
+    {
+        result.y = CUDART_NAN_FP16;
+    }
+    return result;
+)
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
+{ // Per-half minimum where a NaN in either operand's half makes that half of the result NaN.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __BINARY_OP_HALF2_MACRO(min.NaN) // macro is defined outside this hunk; presumably emits the packed "min.NaN" instruction
+,
+    __half2 result = __hmin2(a, b); // fallback branch: plain packed min first, NaN lanes overwritten below
+    if (__hisnan(a.x) || __hisnan(b.x))
+    {
+        result.x = CUDART_NAN_FP16;
+    }
+    if (__hisnan(a.y) || __hisnan(b.y))
+    {
+        result.y = CUDART_NAN_FP16;
+    }
+    return result;
+)
+}
+#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
+{ // Packed fused multiply-add followed by a ReLU-style lower bound at zero on each half.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    __TERNARY_OP_HALF2_MACRO(fma.rn.relu) // macro is defined outside this hunk; presumably emits the packed "fma.rn.relu" instruction
+,
+    __half2_raw hzero; // fallback branch: build a packed zero from raw bits
+    hzero.x = (unsigned short)0U;
+    hzero.y = (unsigned short)0U;
+    return __hmax2_nan(__hfma2(a, b, c), __half2(hzero)); // NaN-propagating per-half max of the fma result with zero
+)
+}
+#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_FP16_DECL__  __half2 atomicAdd(__half2 *const address, const __half2 val) { // Atomically adds val to *address; the fallback branch returns the value held before the add.
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_60,
+    __half2 r;
+    asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" // native packed atomic add; __PTR picks the address constraint ("l" or "r")
+                  : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
+                  : "memory");
+    return r;
+,
+    unsigned int* address_as_uint = (unsigned int*)address; // fallback branch: treat the __half2 as one 32-bit word
+    unsigned int old = *address_as_uint;
+    unsigned int assumed;
+    do {
+        assumed = old;
+        __half2 new_val = __hadd2(val, *(__half2*)&assumed); // sum computed from the last observed word
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); // store only if the word is still "assumed"
+    } while (assumed != old); // retry when the word changed between the read and the CAS
+    return *(__half2*)&old;
+)
+}
+
+#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__  __half atomicAdd(__half *const address, const __half val) { // Atomically adds val to *address using the native instruction; no CAS fallback in this overload.
+    __half r;
+    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" // __PTR picks the address constraint ("l" or "r")
+                  : "=h"(__HALF_TO_US(r))
+                  : __PTR(address), "h"(__HALF_TO_CUS(val))
+                  : "memory");
+    return r; // value produced by the atom instruction; presumably the contents of *address before the add
+}
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */
+
+#undef __PTR
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+#endif /* !(defined __DOXYGEN_ONLY__) */
+#endif /* defined(__cplusplus) */
+
+#undef __TERNARY_OP_HALF2_MACRO
+#undef __TERNARY_OP_HALF_MACRO
+#undef __BINARY_OP_HALF2_MACRO
+#undef __BINARY_OP_HALF_MACRO
+
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+#undef __CUDA_FP16_DECL__
+
+#undef __HALF_TO_US
+#undef __HALF_TO_CUS
+#undef __HALF2_TO_UI
+#undef __HALF2_TO_CUI
+#undef __CUDA_FP16_CONSTEXPR__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+#undef __CPP_VERSION_AT_LEAST_11_FP16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+#undef ___CUDA_FP16_STRINGIFY_INNERMOST
+#undef __CUDA_FP16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_FP16_HPP__ */
diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py
index a98af1f6d..4b53bee8b 100644
--- a/numba_cuda/numba/cuda/models.py
+++ b/numba_cuda/numba/cuda/models.py
@@ -6,9 +6,10 @@
 from llvmlite import ir
 
 from numba.core.datamodel.registry import DataModelManager, register
+from numba.core.datamodel import PrimitiveModel
 from numba.core.extending import models
 from numba.core import types
-from numba.cuda.types import Dim3, GridGroup, CUDADispatcher
+from numba.cuda.types import Dim3, GridGroup, CUDADispatcher, Bfloat16
 
 
 cuda_data_manager = DataModelManager()
@@ -45,3 +46,10 @@ def __init__(self, dmm, fe_type):
 
 
 register_model(CUDADispatcher)(models.OpaqueModel)
+
+
+@register_model(Bfloat16)
+class _model___nv_bfloat16(PrimitiveModel):
+    def __init__(self, dmm, fe_type):
+        be_type = ir.IntType(16)
+        super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type)
diff --git a/numba_cuda/numba/cuda/printimpl.py b/numba_cuda/numba/cuda/printimpl.py
index 558335191..a0e236909 100644
--- a/numba_cuda/numba/cuda/printimpl.py
+++ b/numba_cuda/numba/cuda/printimpl.py
@@ -8,7 +8,7 @@
 from numba.core.errors import NumbaWarning
 from numba.core.imputils import Registry
 from numba.cuda import nvvmutils
-from numba.cuda.types import Dim3
+from numba.cuda.types import Dim3, Bfloat16
 from warnings import warn
 
 registry = Registry()
@@ -51,6 +51,17 @@ def real_print_impl(ty, context, builder, val):
     return "%f", [lld]
 
 
+@print_item.register(Bfloat16)
+def bfloat16_print_impl(ty, context, builder, val):
+    # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext
+    bits32 = builder.zext(val, ir.IntType(32))
+    shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16))
+    f32 = builder.bitcast(shift, ir.FloatType())
+    # printf("%f") expects a double; promote to f64 to match vararg expectation
+    f64 = builder.fpext(f32, ir.DoubleType())
+    return "%f", [f64]
+
+
 @print_item.register(types.StringLiteral)
 def const_print_impl(ty, context, builder, sigval):
     pyval = ty.literal_value
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
index af6988dca..1ee2c5be6 100644
--- a/numba_cuda/numba/cuda/target.py
+++ b/numba_cuda/numba/cuda/target.py
@@ -33,7 +33,14 @@
 
 class CUDATypingContext(typing.BaseContext):
     def load_additional_registries(self):
-        from . import cudadecl, cudamath, fp16, libdevicedecl, vector_types
+        from . import (
+            cudadecl,
+            cudamath,
+            fp16,
+            bf16,
+            libdevicedecl,
+            vector_types,
+        )
         from numba.core.typing import enumdecl, cffi_utils
 
         self.install_registry(cudadecl.registry)
@@ -44,6 +51,7 @@ def load_additional_registries(self):
         self.install_registry(enumdecl.registry)
         self.install_registry(vector_types.typing_registry)
         self.install_registry(fp16.typing_registry)
+        self.install_registry(bf16.typing_registry)
 
     def resolve_value_type(self, val):
         # treat other dispatcher object as another device function
@@ -156,6 +164,7 @@ def load_additional_registries(self):
             libdeviceimpl,
             mathimpl,
             vector_types,
+            bf16,
         )
 
         # fix for #8940
@@ -169,6 +178,7 @@ def load_additional_registries(self):
         self.install_registry(mathimpl.registry)
         self.install_registry(vector_types.impl_registry)
         self.install_registry(fp16.target_registry)
+        self.install_registry(bf16.target_registry)
 
     def codegen(self):
         return self._internal_codegen
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
index af392eb39..95e5fe140 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
@@ -1,8 +1,118 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from numba import cuda, float32
-from numba.cuda.bf16 import bfloat16
+import numpy as np
+from ml_dtypes import bfloat16 as mldtypes_bf16
+
+from numba import (
+    cuda,
+    float32,
+    float64,
+    int16,
+    int32,
+    int64,
+    uint16,
+    uint32,
+    uint64,
+    config,
+)
+
+
+if not config.ENABLE_CUDASIM:
+    from numba.cuda.bf16 import (
+        bfloat16,
+        habs,
+        hadd,
+        hsub,
+        hmul,
+        hadd_rn,
+        hsub_rn,
+        hmul_rn,
+        hdiv,
+        hadd_sat,
+        hsub_sat,
+        hmul_sat,
+        hfma,
+        hfma_sat,
+        hneg,
+        hfma_relu,
+        # Comparison intrinsics
+        heq,
+        hne,
+        hge,
+        hgt,
+        hle,
+        hlt,
+        hmax,
+        hmin,
+        hmax_nan,
+        hmin_nan,
+        hisnan,
+        hisinf,
+        # Conversion intrinsics (NumPy-style names)
+        bfloat16_to_int8_rz,
+        bfloat16_to_uint8_rz,
+        int16_to_bfloat16_rn,
+        int16_to_bfloat16_rz,
+        int16_to_bfloat16_rd,
+        int16_to_bfloat16_ru,
+        bfloat16_to_int16_rn,
+        bfloat16_to_int16_rz,
+        bfloat16_to_int16_rd,
+        bfloat16_to_int16_ru,
+        uint16_to_bfloat16_rn,
+        uint16_to_bfloat16_rz,
+        uint16_to_bfloat16_rd,
+        uint16_to_bfloat16_ru,
+        bfloat16_to_uint16_rn,
+        bfloat16_to_uint16_rz,
+        bfloat16_to_uint16_rd,
+        bfloat16_to_uint16_ru,
+        int32_to_bfloat16_rn,
+        int32_to_bfloat16_rz,
+        int32_to_bfloat16_rd,
+        int32_to_bfloat16_ru,
+        bfloat16_to_int32_rn,
+        bfloat16_to_int32_rz,
+        bfloat16_to_int32_rd,
+        bfloat16_to_int32_ru,
+        uint32_to_bfloat16_rn,
+        uint32_to_bfloat16_rz,
+        uint32_to_bfloat16_rd,
+        uint32_to_bfloat16_ru,
+        bfloat16_to_uint32_rn,
+        bfloat16_to_uint32_rz,
+        bfloat16_to_uint32_rd,
+        bfloat16_to_uint32_ru,
+        bfloat16_to_int64_rn,
+        bfloat16_to_int64_rz,
+        bfloat16_to_int64_rd,
+        bfloat16_to_int64_ru,
+        int64_to_bfloat16_rn,
+        int64_to_bfloat16_rz,
+        int64_to_bfloat16_rd,
+        int64_to_bfloat16_ru,
+        bfloat16_to_uint64_rn,
+        bfloat16_to_uint64_rz,
+        bfloat16_to_uint64_rd,
+        bfloat16_to_uint64_ru,
+        uint64_to_bfloat16_rn,
+        uint64_to_bfloat16_rz,
+        uint64_to_bfloat16_rd,
+        uint64_to_bfloat16_ru,
+        bfloat16_as_int16,
+        int16_as_bfloat16,
+        bfloat16_as_uint16,
+        uint16_as_bfloat16,
+        bfloat16_to_float32,
+        float32_to_bfloat16,
+        float64_to_bfloat16,
+        float32_to_bfloat16_rn,
+        float32_to_bfloat16_rz,
+        float32_to_bfloat16_rd,
+        float32_to_bfloat16_ru,
+    )
+
 from numba.cuda.testing import CUDATestCase
 
 import math
@@ -61,3 +171,431 @@ def kernel(arr):
                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
                 else:
                     self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
+
+    def test_arithmetic_intrinsics_basic(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            x = bfloat16(1.25)
+            y = bfloat16(-2.5)
+
+            res[0] = float32(habs(y))
+            res[1] = float32(hadd(x, y))
+            res[2] = float32(hsub(x, y))
+            res[3] = float32(hmul(x, y))
+            res[4] = float32(hdiv(y, x))
+            res[5] = float32(hneg(x))
+            res[6] = float32(hfma(x, y, y))
+            # The *_rn variants are checked against the same references.
+            res[7] = float32(hadd_rn(x, y))
+            res[8] = float32(hsub_rn(x, y))
+            res[9] = float32(hmul_rn(x, y))
+
+        res = cuda.device_array((10,), dtype="float32")
+        kernel[1, 1](res)
+
+        # Host-side float references, listed in kernel write order.
+        x, y = 1.25, -2.5
+        expected = [
+            abs(y),
+            x + y,
+            x - y,
+            x * y,
+            y / x,
+            -x,
+            x * y + y,
+            x + y,
+            x - y,
+            x * y,
+        ]
+        for slot, want in enumerate(expected):
+            self.assertAlmostEqual(res[slot], want, delta=1e-2)
+
+    def test_arithmetic_intrinsics_saturating(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            x = bfloat16(1.5)
+            y = bfloat16(0.75)
+
+            res[0] = float32(hadd_sat(x, y))  # 2.25 -> 1.0
+            res[1] = float32(hsub_sat(y, x))  # -0.75 -> 0.0
+            res[2] = float32(hmul_sat(x, y))  # 1.125 -> 1.0
+            res[3] = float32(hfma_sat(x, y, x))  # 1.125 + 1.5 -> 1.0
+
+        res = cuda.device_array((4,), dtype="float32")
+        kernel[1, 1](res)
+
+        # Every result is expected to land on a bound of [0, 1].
+        for slot, want in enumerate((1.0, 0.0, 1.0, 1.0)):
+            self.assertAlmostEqual(res[slot], want, delta=1e-3)
+
+        # Also check they are clamped within [0, 1]
+        for slot in range(4):
+            value = res[slot]
+            self.assertGreaterEqual(value, 0.0)
+            self.assertLessEqual(value, 1.0)
+
+    def test_fma_relu_intrinsic(self):
+        """hfma_relu must turn a negative fused multiply-add into zero."""
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            x = bfloat16(-1.5)
+            y = bfloat16(2.0)
+            z = bfloat16(0.0)
+            # -1.5 * 2.0 + 0.0 = -3.0 -> relu -> 0.0
+            res[0] = float32(hfma_relu(x, y, z))
+
+        res = cuda.device_array((1,), dtype="float32")
+        kernel[1, 1](res)
+        self.assertAlmostEqual(res[0], 0.0, delta=1e-3)
+
+    def test_comparison_intrinsics(self):
+        self.skip_unsupported()
+
+        def make_kernel(cmpfn):
+            @cuda.jit
+            def kernel(flag, lhs, rhs):
+                # Narrow both host floats to bfloat16 before comparing.
+                verdict = cmpfn(bfloat16(lhs), bfloat16(rhs))
+                flag[0] = verdict
+
+            return kernel
+
+        intrinsics = (heq, hne, hge, hgt, hle, hlt)
+        references = (
+            lambda x, y: x == y,
+            lambda x, y: x != y,
+            lambda x, y: x >= y,
+            lambda x, y: x > y,
+            lambda x, y: x <= y,
+            lambda x, y: x < y,
+        )
+
+        for cmpfn, reference in zip(intrinsics, references):
+            with self.subTest(cmpfn=cmpfn):
+                kernel = make_kernel(cmpfn)
+                flag = cuda.device_array((1,), dtype="bool")
+
+                # Equal operands.
+                lhs, rhs = 3.0, 3.0
+                kernel[1, 1](flag, lhs, rhs)
+                self.assertEqual(bool(flag[0]), reference(lhs, rhs))
+
+                # Left operand smaller.
+                lhs, rhs = 3.0, 4.0
+                kernel[1, 1](flag, lhs, rhs)
+                self.assertEqual(bool(flag[0]), reference(lhs, rhs))
+
+                # Left operand larger.
+                lhs, rhs = 4.0, 3.0
+                kernel[1, 1](flag, lhs, rhs)
+                self.assertEqual(bool(flag[0]), reference(lhs, rhs))
+
+    def test_hmax_hmin_intrinsics(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            smaller = bfloat16(3.0)
+            larger = bfloat16(4.0)
+            res[0] = float32(hmax(smaller, larger))
+            res[1] = float32(hmin(smaller, larger))
+
+        res = cuda.device_array((2,), dtype="float32")
+        kernel[1, 1](res)
+        for slot, want in enumerate((4.0, 3.0)):  # hmax first, then hmin
+            self.assertAlmostEqual(res[slot], want, delta=1e-3)
+
+    def test_nan_and_inf_intrinsics(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(nan_flag, inf_code):
+            not_a_number = bfloat16(float("nan"))
+            infinity = bfloat16(float("inf"))
+            nan_flag[0] = hisnan(not_a_number)
+            inf_code[0] = hisinf(infinity)
+
+        nan_flag = cuda.device_array((1,), dtype="bool")
+        inf_code = cuda.device_array((1,), dtype="int32")  # any non-zero passes
+        kernel[1, 1](nan_flag, inf_code)
+        self.assertTrue(bool(nan_flag[0]))
+        self.assertNotEqual(int(inf_code[0]), 0)
+
+    def test_hmax_nan_hmin_nan_intrinsics(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            nan_operand = bfloat16(float("nan"))
+            finite_operand = bfloat16(2.0)
+            res[0] = float32(hmax_nan(nan_operand, finite_operand))
+            res[1] = float32(hmin_nan(nan_operand, finite_operand))
+            res[2] = float32(hmax(nan_operand, finite_operand))
+            res[3] = float32(hmin(nan_operand, finite_operand))
+
+        res = cuda.device_array((4,), dtype="float32")
+        kernel[1, 1](res)
+        # NaN-propagating variants should produce NaN
+        for slot in (0, 1):
+            self.assertTrue(math.isnan(res[slot]))
+        # Non-NaN variants should return the non-NaN operand
+        for slot in (2, 3):
+            self.assertAlmostEqual(res[slot], 2.0, delta=1e-3)
+
+    def test_bfloat16_as_bitcast(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def roundtrip_kernel(bits, signed_out, unsigned_out):
+            signed_out[0] = int16_as_bfloat16(bfloat16_as_int16(bits))
+            unsigned_out[0] = uint16_as_bfloat16(bfloat16_as_uint16(bits))
+
+        bits = np.int16(0x3FC0)  # 1.5 in bfloat16
+        signed_out = cuda.device_array((1,), dtype="int16")
+        unsigned_out = cuda.device_array((1,), dtype="uint16")
+        roundtrip_kernel[1, 1](bits, signed_out, unsigned_out)
+
+        for result in (signed_out, unsigned_out):
+            self.assertEqual(result[0], bits)
+
+    def test_to_integer_conversions(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(bits, i1, i2, i3, i4, u1, u2, u3, u4):
+            value = int16_as_bfloat16(bits)
+
+            i1[0] = bfloat16_to_int8_rz(value)
+            u1[0] = bfloat16_to_uint8_rz(value)
+            i2[0] = bfloat16_to_int16_rn(value)
+            i2[1] = bfloat16_to_int16_rz(value)
+            i2[2] = bfloat16_to_int16_rd(value)
+            i2[3] = bfloat16_to_int16_ru(value)
+            u2[0] = bfloat16_to_uint16_rn(value)
+            u2[1] = bfloat16_to_uint16_rz(value)
+            u2[2] = bfloat16_to_uint16_rd(value)
+            u2[3] = bfloat16_to_uint16_ru(value)
+            i3[0] = bfloat16_to_int32_rn(value)
+            i3[1] = bfloat16_to_int32_rz(value)
+            i3[2] = bfloat16_to_int32_rd(value)
+            i3[3] = bfloat16_to_int32_ru(value)
+            u3[0] = bfloat16_to_uint32_rn(value)
+            u3[1] = bfloat16_to_uint32_rz(value)
+            u3[2] = bfloat16_to_uint32_rd(value)
+            u3[3] = bfloat16_to_uint32_ru(value)
+            i4[0] = bfloat16_to_int64_rn(value)
+            i4[1] = bfloat16_to_int64_rz(value)
+            i4[2] = bfloat16_to_int64_rd(value)
+            i4[3] = bfloat16_to_int64_ru(value)
+            u4[0] = bfloat16_to_uint64_rn(value)
+            u4[1] = bfloat16_to_uint64_rz(value)
+            u4[2] = bfloat16_to_uint64_rd(value)
+            u4[3] = bfloat16_to_uint64_ru(value)
+
+        # Only the rz variant is exercised for the 8-bit targets.
+        i1 = cuda.device_array((1,), dtype="int8")
+        u1 = cuda.device_array((1,), dtype="uint8")
+        # Wider targets hold four slots each: rn, rz, rd, ru.
+        i2 = cuda.device_array((4,), dtype="int16")
+        i3 = cuda.device_array((4,), dtype="int32")
+        i4 = cuda.device_array((4,), dtype="int64")
+        u2 = cuda.device_array((4,), dtype="uint16")
+        u3 = cuda.device_array((4,), dtype="uint32")
+        u4 = cuda.device_array((4,), dtype="uint64")
+
+        bits = np.int16(0x3FC0)  # 1.5 in bfloat16
+
+        kernel[1, 1](bits, i1, i2, i3, i4, u1, u2, u3, u4)
+
+        self.assertEqual(i1[0], 1)
+        self.assertEqual(u1[0], 1)
+
+        # 1.5 -> 2 (rn), 1 (rz), 1 (rd), 2 (ru) for every wider type.
+        rounded = [2, 1, 1, 2]
+        signed_results = [(i2, "int16"), (i3, "int32"), (i4, "int64")]
+        unsigned_results = [(u2, "uint16"), (u3, "uint32"), (u4, "uint64")]
+
+        for got, dtype in signed_results + unsigned_results:
+            want = np.array(rounded, dtype)
+            np.testing.assert_equal(got, want)
+
+    def test_from_integer_conversions(self):
+        """Check every integer -> bfloat16 intrinsic against ml_dtypes."""
+        self.skip_unsupported()
+        test_val = 789
+
+        @cuda.jit
+        def kernel(out):
+            i2 = int16(test_val)
+            i3 = int32(test_val)
+            i4 = int64(test_val)
+            u2 = uint16(test_val)
+            u3 = uint32(test_val)
+            u4 = uint64(test_val)
+
+            i2rn = int16_to_bfloat16_rn(i2)
+            i2rz = int16_to_bfloat16_rz(i2)
+            i2rd = int16_to_bfloat16_rd(i2)
+            i2ru = int16_to_bfloat16_ru(i2)
+
+            u2rn = uint16_to_bfloat16_rn(u2)
+            u2rz = uint16_to_bfloat16_rz(u2)
+            u2rd = uint16_to_bfloat16_rd(u2)
+            u2ru = uint16_to_bfloat16_ru(u2)
+
+            i3rn = int32_to_bfloat16_rn(i3)
+            i3rz = int32_to_bfloat16_rz(i3)
+            i3rd = int32_to_bfloat16_rd(i3)
+            i3ru = int32_to_bfloat16_ru(i3)
+
+            u3rn = uint32_to_bfloat16_rn(u3)
+            u3rz = uint32_to_bfloat16_rz(u3)
+            u3rd = uint32_to_bfloat16_rd(u3)
+            u3ru = uint32_to_bfloat16_ru(u3)
+
+            i4rn = int64_to_bfloat16_rn(i4)
+            i4rz = int64_to_bfloat16_rz(i4)
+            i4rd = int64_to_bfloat16_rd(i4)
+            i4ru = int64_to_bfloat16_ru(i4)
+
+            u4rn = uint64_to_bfloat16_rn(u4)
+            u4rz = uint64_to_bfloat16_rz(u4)
+            u4rd = uint64_to_bfloat16_rd(u4)
+            u4ru = uint64_to_bfloat16_ru(u4)
+            # Slots are grouped i2, u2, i3, u3, i4, u4 (rn, rz, rd, ru each).
+            out[0] = bfloat16_as_int16(i2rn)
+            out[1] = bfloat16_as_int16(i2rz)
+            out[2] = bfloat16_as_int16(i2rd)
+            out[3] = bfloat16_as_int16(i2ru)
+            out[4] = bfloat16_as_int16(u2rn)
+            out[5] = bfloat16_as_int16(u2rz)
+            out[6] = bfloat16_as_int16(u2rd)
+            out[7] = bfloat16_as_int16(u2ru)
+            out[8] = bfloat16_as_int16(i3rn)
+            out[9] = bfloat16_as_int16(i3rz)
+            out[10] = bfloat16_as_int16(i3rd)
+            out[11] = bfloat16_as_int16(i3ru)
+            out[12] = bfloat16_as_int16(u3rn)
+            out[13] = bfloat16_as_int16(u3rz)
+            out[14] = bfloat16_as_int16(u3rd)
+            out[15] = bfloat16_as_int16(u3ru)
+            out[16] = bfloat16_as_int16(i4rn)
+            out[17] = bfloat16_as_int16(i4rz)
+            out[18] = bfloat16_as_int16(i4rd)
+            out[19] = bfloat16_as_int16(i4ru)
+            out[20] = bfloat16_as_int16(u4rn)
+            out[21] = bfloat16_as_int16(u4rz)
+            out[22] = bfloat16_as_int16(u4rd)
+            out[23] = bfloat16_as_int16(u4ru)
+
+        out = cuda.device_array((24,), dtype="int16")
+        kernel[1, 1](out)
+        res = out.copy_to_host()
+
+        i2 = np.int16(test_val).astype(mldtypes_bf16).view("int16")
+        i3 = np.int32(test_val).astype(mldtypes_bf16).view("int16")
+        i4 = np.int64(test_val).astype(mldtypes_bf16).view("int16")
+        u2 = np.uint16(test_val).astype(mldtypes_bf16).view("int16")
+        u3 = np.uint32(test_val).astype(mldtypes_bf16).view("int16")
+        u4 = np.uint64(test_val).astype(mldtypes_bf16).view("int16")
+
+        i2arr = np.array([i2] * 4)
+        i3arr = np.array([i3] * 4)
+        i4arr = np.array([i4] * 4)
+        u2arr = np.array([u2] * 4)
+        u3arr = np.array([u3] * 4)
+        u4arr = np.array([u4] * 4)
+        # Each slice is compared with the expectation of its own source type.
+        two = np.ones_like(res[0:4]) * 2
+        np.testing.assert_array_less(_bf16_ulp_distance(res[0:4], i2arr), two)
+        np.testing.assert_array_less(_bf16_ulp_distance(res[4:8], u2arr), two)
+        np.testing.assert_array_less(_bf16_ulp_distance(res[8:12], i3arr), two)
+        np.testing.assert_array_less(_bf16_ulp_distance(res[12:16], u3arr), two)
+        np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], i4arr), two)
+        np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two)
+
+    def test_to_float_conversions(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel(res):
+            # Build the bfloat16 operand and widen it in a single expression.
+            res[0] = bfloat16_to_float32(bfloat16(1.5))
+
+        res = cuda.device_array((1,), dtype="float32")
+        kernel[1, 1](res)
+
+        self.assertAlmostEqual(res[0], 1.5, delta=1e-7)  # conversion is exact
+
+    def test_from_float_conversions(self):
+        """Check the float -> bfloat16 intrinsics against ml_dtypes."""
+        self.skip_unsupported()
+        test_val = 1.5
+
+        @cuda.jit
+        def kernel(out):
+            f4 = float32(test_val)
+            f8 = float64(test_val)
+
+            f4rn = float32_to_bfloat16_rn(f4)
+            f4rz = float32_to_bfloat16_rz(f4)
+            f4rd = float32_to_bfloat16_rd(f4)
+            f4ru = float32_to_bfloat16_ru(f4)
+
+            f4_default = float32_to_bfloat16(f4)
+            f8_default = float64_to_bfloat16(f8)
+
+            out[0] = bfloat16_as_int16(f4rn)
+            out[1] = bfloat16_as_int16(f4rz)
+            out[2] = bfloat16_as_int16(f4rd)
+            out[3] = bfloat16_as_int16(f4ru)
+            out[4] = bfloat16_as_int16(f4_default)
+            out[5] = bfloat16_as_int16(f8_default)
+
+        out = cuda.device_array((6,), dtype="int16")  # six kernel writes
+        kernel[1, 1](out)
+        raw = out.copy_to_host()
+
+        f4_expected = (
+            np.array([test_val] * 5, "float32")
+            .astype(mldtypes_bf16)
+            .view("int16")
+        )
+        f8_expected = (
+            np.array([test_val] * 1, "float64")
+            .astype(mldtypes_bf16)
+            .view("int16")
+        )
+
+        np.testing.assert_array_less(
+            _bf16_ulp_distance(raw[0:5], f4_expected), 2
+        )
+        np.testing.assert_array_less(
+            _bf16_ulp_distance(raw[5:], f8_expected), 2
+        )
+
+
+def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray:
+    """
+    Map raw bfloat16 bit patterns (given as int16) to a monotonic int32 rank.
+    The rank is 0x8000 plus the sign-magnitude value, so +0 and -0 share a rank
+    and neighbouring representable values differ by exactly one.
+    """
+    u = bits_int16.view(np.uint16).astype(np.int32)  # widen: uint16 would wrap
+    magnitude = u & 0x7FFF
+    return np.where((u >> 15) == 0, 0x8000 + magnitude, 0x8000 - magnitude)
+
+
+def _bf16_ulp_distance(
+    a_bits_int16: np.ndarray, b_bits_int16: np.ndarray
+) -> np.ndarray:
+    """Return the absolute ULP gap between two bfloat16 bit patterns."""
+    rank_a = _bf16_ulp_rank(a_bits_int16)
+    rank_b = _bf16_ulp_rank(b_bits_int16)
+    return np.abs(rank_a - rank_b)
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
index a10949de9..7d4343e35 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
@@ -4,6 +4,8 @@
 import numba.cuda as cuda
 from numba.cuda.testing import unittest, CUDATestCase
 import numpy as np
+import operator
+from numba.cuda.testing import skip_if_nvjitlink_missing
 
 from numba import (
     config,
@@ -292,6 +294,37 @@ def kernel(arr):
 
         np.testing.assert_allclose(arr, [3], atol=1e-2)
 
+    @skip_if_nvjitlink_missing("LTO is not supported without nvjitlink.")
+    def test_bf16_intrinsics_used_in_lto(self):
+        self.skip_unsupported()
+        # Each operator is paired with the PTX instruction expected after LTO.
+        operations = [
+            (operator.add, "fma.rn.bf16"),
+            (operator.sub, "fma.rn.bf16"),
+            (operator.mul, "fma.rn.bf16"),
+            (
+                operator.truediv,
+                "div.approx.f32",
+            ),  # no native bf16 div, see cuda_bf16.hpp:L3067
+        ]
+
+        for op, ptx_op in operations:
+            with self.subTest(op=op):
+
+                @cuda.jit(lto=True)
+                def kernel(arr):
+                    a = nv_bfloat16(3.14)
+                    b = nv_bfloat16(5)
+                    arr[0] = float32(op(a, b))
+
+                arr = np.zeros(1, np.float32)
+                kernel[1, 1](arr)
+                np.testing.assert_allclose(arr, [op(3.14, 5)], atol=1e-1)
+
+                ptx = next(iter(kernel.inspect_lto_ptx().values()))
+                # unittest assertion: still runs under -O, unlike bare assert.
+                self.assertIn(ptx_op, ptx)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py
index ca7a5ff13..ff27fd169 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -102,6 +102,20 @@ def print_too_many(r):
 cuda.synchronize()
 """
 
+print_bfloat16_usecase = """\
+from numba import cuda, config
+
+@cuda.jit
+def print_bfloat16():
+    # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits.
+    # printing this should not give any rounding error.
+    a = cuda.types.bfloat16(0.9375)
+    print(a, a, a)
+
+print_bfloat16[1, 1]()
+cuda.synchronize()
+"""
+
 
 class TestPrint(CUDATestCase):
     # Note that in these tests we generally strip the output to avoid dealing
@@ -148,6 +162,11 @@ def test_dim3(self):
         expected = [str(i) for i in np.ndindex(2, 2, 2)]
         self.assertEqual(sorted(lines), expected)
 
+    @skip_on_cudasim("bfloat16 on host is not yet supported.")
+    def test_bfloat16(self):  # the use case prints `a` three times in one call
+        output, _ = self.run_code(print_bfloat16_usecase)
+        self.assertEqual(output.strip(), "0.937500 0.937500 0.937500")
+
     @skip_on_cudasim("cudasim can print unlimited output")
     def test_too_many_args(self):
         # Tests that we emit the format string and warn when there are more
diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py
index 437e0d2f2..d1ec8c28d 100644
--- a/numba_cuda/numba/cuda/types.py
+++ b/numba_cuda/numba/cuda/types.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from numba.core import types
+from numba.core.typeconv import Conversion
 
 
 class Dim3(types.Type):
@@ -41,3 +42,58 @@ class CUDADispatcher(types.Dispatcher):
     # is still probably a good idea to have a separate type for CUDA
     # dispatchers, and this type might get other differentiation from the CPU
     # dispatcher type in future.
+
+
+class Bfloat16(types.Number):
+    """
+    A bfloat16 type. Has 8 exponent bits and 7 significand bits.
+
+    Conversion rules:
+    Floats:
+        from fp16, fp32, fp64: UNSAFE (loses precision)
+        to fp32, fp64: SAFE (same exponent, more mantissa)
+        to fp16: UNSAFE (loses range)
+    Integers:
+        from 8-bit ints: SAFE
+        from other ints: UNSAFE (bf16 cannot represent all integers in range)
+        to any int: UNSAFE (loses precision, rounds toward zero)
+
+    All other conversions are not allowed (the methods return None).
+    """
+
+    def __init__(self):
+        super().__init__(name="__nv_bfloat16")
+
+        self.alignof_ = 2
+        self.bitwidth = 16
+
+    def can_convert_from(self, typingctx, other):
+        """Rate converting ``other`` to bfloat16; None when unsupported."""
+        if isinstance(other, types.Float):
+            return Conversion.unsafe
+
+        elif isinstance(other, types.Integer):
+            if other.bitwidth == 8:
+                return Conversion.safe
+            else:
+                return Conversion.unsafe
+
+    def can_convert_to(self, typingctx, other):
+        """Rate converting bfloat16 to ``other``; None when unsupported."""
+        if isinstance(other, types.Float):
+            if other.bitwidth >= 32:
+                return Conversion.safe
+            else:
+                return Conversion.unsafe
+        elif isinstance(other, types.Integer):
+            return Conversion.unsafe
+
+    def unify(self, typingctx, other):
+        """Delegate unification with float/integer types to the context."""
+        # NOTE(review): unify_pairs may call back into this method with the
+        # same arguments -- confirm this cannot recurse without bound.
+        if isinstance(other, (types.Float, types.Integer)):
+            return typingctx.unify_pairs(self, other)
+
+
+bfloat16 = Bfloat16()
diff --git a/pyproject.toml b/pyproject.toml
index bed757c95..6ccc44b30 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ test = [
     "pytest",
     "pytest-xdist",
     "filecheck",
+    "ml_dtypes",
 ]
 test-cu12 = [
     "numba-cuda[cu12]",