diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 717ba5dc2..ed2d57cef 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -36,6 +36,7 @@ DEPENDENCIES=( "pytest" "pytest-xdist" "cffi" + "ml_dtypes" "python=${RAPIDS_PY_VERSION}" ) # Constrain oldest supported dependencies for testing diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index a7058619c..844b35b40 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -26,6 +26,7 @@ DEPENDENCIES=( "pytest" "pytest-xdist" "cffi" + "ml_dtypes" "python=${RAPIDS_PY_VERSION}" "numba-cuda" ) diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index 4bdaf8bef..bb85a8733 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -13,6 +13,7 @@ DEPENDENCIES=( "pytest" "pytest-xdist" "cffi" + "ml_dtypes" "python=${RAPIDS_PY_VERSION}" "numba-cuda" ) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index f08cdbd77..2045b0376 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -1,10 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-2-Clause Name: Numba Bfloat16 -Version: 0.0.1 -Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h +Version: 0.0.2 +GPU Arch: + - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16 +Entry Point: ./numba_cuda/numba/cuda/include/13/cuda_bf16.h File List: - - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h + - ./numba_cuda/numba/cuda/include/13/cuda_bf16.h Exclude: {} Types: __nv_bfloat16_raw: Number @@ -21,6 +23,4 @@ Data Models: __nv_bfloat162: StructModel nv_bfloat162: StructModel Shim Include Override: "\"cuda_bf16.h\"" -Additional Import: - - os -Require Pynvjitlink: False +Use Separate Registry: True diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 4591a8905..9cc4c2bf2 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -84,7 +84,7 @@ Data Movement and Casts Construction of a single instance of a ``bfloat16`` object: -.. function:: numba.cuda.bf16.bfloat16(b) +.. function:: numba.cuda.types.bfloat16(b) Constructs a ``bfloat16`` from existing device `scalar`. Supported scalar types: @@ -96,6 +96,7 @@ Construction of a single instance of a ``bfloat16`` object: - ``int32`` - ``uint64`` - ``uint32`` + - ``float16`` Conversely, ``bfloat16`` data can be cast back to existing native data type via ``dtype(b)``, where ``dtype`` is one of the data types above (except float16), @@ -104,7 +105,7 @@ and ``b`` is a bfloat16 object. Arithmetic ********** -Supported arithmetic operations on ``bfloat`16`` operands are: +Supported arithmetic operations on ``bfloat16`` operands are: - Arithmetic (``+``, ``-``, ``*``, ``/``) - Arithmetic assignment operators (``+=``, ``-=``, ``*=``, ``/=``) @@ -144,11 +145,11 @@ on ``bfloat16`` are provided: mode. .. function:: numba.cuda.bf16.hlog2(b) - Calculates bfloat16 decimal logarithm of input ``b`` in round-to-nearest-even - mode. 
+ Calculates bfloat16 binary logarithm (base-2) of input ``b`` in + round-to-nearest-even mode. .. function:: numba.cuda.bf16.hlog10(b) - Calculates bfloat16 natural exponential function of input ``b`` in + Calculates bfloat16 common logarithm (base-10) of input ``b`` in round-to-nearest-even mode. .. function:: numba.cuda.bf16.hcos(b) @@ -191,3 +192,352 @@ on ``bfloat16`` are provided: .. function:: numba.cuda.bf16.hexp10(b) Calculates bfloat16 decimal exponential function of input ``b`` in round-to-nearest-even mode. + + +Arithmetic Intrinsics +********************* + +The following low-level arithmetic intrinsics are available under +``numba.cuda.bf16`` and map to CUDA bfloat16 arithmetic functions. Unless +otherwise noted, operations are performed in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.habs(a) + + Calculates the absolute value of input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hneg(a) + + Negates input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hadd(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hadd_rn(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. Prevents + contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hadd_sat(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hsub(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hsub_rn(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hsub_sat(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. 
NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hmul(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hmul_rn(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hmul_sat(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hdiv(a, b) + + Divides ``a`` by ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hfma(a, b, c) + + Computes a fused multiply-add of ``a`` and ``b`` plus ``c`` (bfloat16) in + round-to-nearest-even mode; i.e. returns ``a * b + c``. + +.. function:: numba.cuda.bf16.hfma_sat(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with saturation to the + range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hfma_relu(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with ReLU saturation; + i.e. returns ``max(0, a * b + c)``. + +Comparison Intrinsics +********************* + +Device-level comparison intrinsics operating on ``bfloat16`` values are +available under ``numba.cuda.bf16``. Unless stated otherwise, the ordered +comparisons return ``False`` if either input is NaN, following IEEE semantics. + +.. function:: numba.cuda.bf16.heq(a, b) + + Ordered equality. Returns ``True`` iff ``a == b``. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hne(a, b) + + Ordered inequality. Returns ``True`` iff ``a != b`` and neither input is NaN. + NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hge(a, b) + + Ordered greater-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hgt(a, b) + + Ordered greater-than. NaN inputs yield ``False``. + +.. 
function:: numba.cuda.bf16.hle(a, b) + + Ordered less-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hlt(a, b) + + Ordered less-than. NaN inputs yield ``False``. + +The unordered comparison variants return ``True`` when either input is NaN: + +.. function:: numba.cuda.bf16.hequ(a, b) + + Unordered equality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a == b``. + +.. function:: numba.cuda.bf16.hneu(a, b) + + Unordered inequality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a != b``. + +.. function:: numba.cuda.bf16.hgeu(a, b) + + Unordered greater-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a >= b``. + +.. function:: numba.cuda.bf16.hgtu(a, b) + + Unordered greater-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a > b``. + +.. function:: numba.cuda.bf16.hleu(a, b) + + Unordered less-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a <= b``. + +.. function:: numba.cuda.bf16.hltu(a, b) + + Unordered less-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a < b``. + +Min/Max operations follow CUDA semantics for zeros and NaNs: + +.. function:: numba.cuda.bf16.hmax(a, b) + + Returns ``max(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmin(a, b) + + Returns ``min(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmax_nan(a, b) + + Returns ``max(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +.. function:: numba.cuda.bf16.hmin_nan(a, b) + + Returns ``min(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +Special value predicates: + +.. 
function:: numba.cuda.bf16.hisnan(a) + + Returns ``True`` if ``a`` is a NaN, ``False`` otherwise. + +.. function:: numba.cuda.bf16.hisinf(a) + + Returns a nonzero integer if ``a`` is infinite, otherwise ``0``. + +.. note:: + + Python comparison operators on ``bfloat16`` values in device code map to + the ordered comparisons above. For more details on the CUDA bfloat16 + comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions + <https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH____BFLOAT16__COMPARISON.html>`_. + +Precision Conversion and Data Movement +************************************** + +The following conversion intrinsics convert between ``bfloat16`` and other +scalar types. Rounding-mode suffixes: + +- ``_rn``: round-to-nearest-even +- ``_rz``: round-towards-zero +- ``_rd``: round-down (towards −∞) +- ``_ru``: round-up (towards +∞) + +Floating-point conversions +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. function:: numba.cuda.bf16.float32_to_bfloat16(x) + + Convert a ``float32`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.float64_to_bfloat16(x) + + Convert a ``float64`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.bfloat16_to_float32(x) + + Convert a ``bfloat16`` to ``float32``. + +.. function:: numba.cuda.bf16.float32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_ru(x) + + Convert a ``float32`` to ``bfloat16`` using the specified rounding mode. + +Integer conversions +^^^^^^^^^^^^^^^^^^^^ + +Representative APIs for each integer width are listed below. All have +rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``. + +int16 (signed 16-bit) +""""""""""""""""""""" + +.. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rd(x) +.. 
function:: numba.cuda.bf16.int16_to_bfloat16_ru(x) + + Convert an ``int16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_ru(x) + + Convert a ``bfloat16`` to ``int16`` with the selected rounding mode. + +uint16 (unsigned 16-bit) +""""""""""""""""""""""""" + +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_ru(x) + + Convert a ``uint16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_ru(x) + + Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode. + +int32 (signed 32-bit) +""""""""""""""""""""" + +.. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_ru(x) + + Convert an ``int32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_ru(x) + + Convert a ``bfloat16`` to ``int32`` with the selected rounding mode. + +uint32 (unsigned 32-bit) +""""""""""""""""""""""""" + +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rd(x) +.. 
function:: numba.cuda.bf16.uint32_to_bfloat16_ru(x) + + Convert a ``uint32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_ru(x) + + Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode. + +int64 (signed 64-bit) +""""""""""""""""""""" + +.. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_ru(x) + + Convert an ``int64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_ru(x) + + Convert a ``bfloat16`` to ``int64`` with the selected rounding mode. + +uint64 (unsigned 64-bit) +""""""""""""""""""""""""" + +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_ru(x) + + Convert a ``uint64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_ru(x) + + Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode. + +8-bit conversions +^^^^^^^^^^^^^^^^^^ + +.. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x) + + Convert a ``bfloat16`` to ``int8`` with round-towards-zero. + +.. 
function:: numba.cuda.bf16.bfloat16_to_uint8_rz(x) + + Convert a ``bfloat16`` to ``uint8`` with round-towards-zero. + +Bit Reinterpret Casts +^^^^^^^^^^^^^^^^^^^^^ + +These APIs reinterpret bits without numeric conversion: + +.. function:: numba.cuda.bf16.bfloat16_as_int16(x) + + Reinterpret the bits of ``bfloat16`` as an ``int16``. + +.. function:: numba.cuda.bf16.bfloat16_as_uint16(x) + + Reinterpret the bits of ``bfloat16`` as a ``uint16``. + +.. function:: numba.cuda.bf16.int16_as_bfloat16(x) + + Reinterpret the bits of an ``int16`` as a ``bfloat16``. + +.. function:: numba.cuda.bf16.uint16_as_bfloat16(x) + + Reinterpret the bits of a ``uint16`` as a ``bfloat16``. diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index a1cabfdff..33beb2b5a 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -3,19 +3,18 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: -# Ast_canopy version: 0.3.0 -# Numbast version: 0.3.0 -# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal -# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True} -# Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml +# Ast_canopy version: 0.5.0 +# Numbast version: 0.5.0 +# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ +# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} +# Config file path (relative to the path of the generated 
binding): ../../../../../configs/cuda_bf16.yml # Cudatoolkit version: (12, 8) -# Default CUDA_HOME path: /home/wangm/micromamba/envs/numbast +# Default CUDA_HOME path: /home/wangm/miniforge3/envs/numbast # Imports: import io import operator -import os import numba from llvmlite import ir @@ -26,11 +25,21 @@ make_attribute_wrapper, register_model, ) -from numba.cuda.typing import signature +from numba.core.imputils import Registry as TargetRegistry +from numba.core.imputils import lower_cast +from numba.core.typing import signature +from numba.core.typing.builtins import ( + BinOp, + BinOpTrueDiv, + UnaryNegate, + UnaryPositive, + UnorderedCmpOp, + OrderedCmpOp, +) from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.cuda.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda.cudadecl import register, register_attr, register_global -from numba.cuda.cudaimpl import lower +from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( CPointer, @@ -49,9 +58,22 @@ uint16, uint32, uint64, + void, ) +from numba.cuda.types import bfloat16 + +float32x2 = vector_types["float32x2"] +__half = float16 -# Setups: + +typing_registry = TypingRegistry() +register = typing_registry.register +register_attr = typing_registry.register_attr +register_global = typing_registry.register_global +target_registry = TargetRegistry() +lower = target_registry.lower +lower_attr = target_registry.lower_getattr +lower_constant = target_registry.lower_constant # Shim Stream: @@ -79,83 +101,84 @@ def reset(self): shim_stream.write(shim_prefix) shim_obj = CUSource(shim_stream) + # Enums: # Structs: -# Typing for unnamed1401637 -class _type_class_unnamed1401637(Type): +# Typing for unnamed1405307 +class _type_class_unnamed1405307(Type): def __init__(self): - super().__init__(name="unnamed1401637") + super().__init__(name="unnamed1405307") self.alignof_ = 
2 self.bitwidth = 2 * 8 -_type_unnamed1401637 = _type_class_unnamed1401637() +_type_unnamed1405307 = _type_class_unnamed1405307() # Make Python API for struct -unnamed1401637 = type("unnamed1401637", (), {"_nbtype": _type_unnamed1401637}) +unnamed1405307 = type("unnamed1405307", (), {"_nbtype": _type_unnamed1405307}) -as_numba_type.register(unnamed1401637, _type_unnamed1401637) +as_numba_type.register(unnamed1405307, _type_unnamed1405307) -@register_model(_type_class_unnamed1401637) -class _model_unnamed1401637(StructModel): +@register_model(_type_class_unnamed1405307) +class _model_unnamed1405307(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401637(AttributeTemplate): - key = globals()["unnamed1401637"] +class _attr_typing_unnamed1405307(AttributeTemplate): + key = globals()["unnamed1405307"] def resolve_x(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401637, "x", "x") +make_attribute_wrapper(_type_class_unnamed1405307, "x", "x") @register -class _ctor_template_unnamed1401637(ConcreteTemplate): - key = globals()["unnamed1401637"] +class _ctor_template_unnamed1405307(ConcreteTemplate): + key = globals()["unnamed1405307"] cases = [] -register_global(unnamed1401637, Function(_ctor_template_unnamed1401637)) +register_global(unnamed1405307, Function(_ctor_template_unnamed1405307)) -# Typing for unnamed1401746 -class _type_class_unnamed1401746(Type): +# Typing for unnamed1405416 +class _type_class_unnamed1405416(Type): def __init__(self): - super().__init__(name="unnamed1401746") + super().__init__(name="unnamed1405416") self.alignof_ = 4 self.bitwidth = 4 * 8 -_type_unnamed1401746 = _type_class_unnamed1401746() +_type_unnamed1405416 = _type_class_unnamed1405416() # Make Python API for struct -unnamed1401746 = type("unnamed1401746", (), {"_nbtype": _type_unnamed1401746}) +unnamed1405416 = type("unnamed1405416", (), {"_nbtype": 
_type_unnamed1405416}) -as_numba_type.register(unnamed1401746, _type_unnamed1401746) +as_numba_type.register(unnamed1405416, _type_unnamed1405416) -@register_model(_type_class_unnamed1401746) -class _model_unnamed1401746(StructModel): +@register_model(_type_class_unnamed1405416) +class _model_unnamed1405416(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16), ("y", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401746(AttributeTemplate): - key = globals()["unnamed1401746"] +class _attr_typing_unnamed1405416(AttributeTemplate): + key = globals()["unnamed1405416"] def resolve_x(self, obj): return uint16 @@ -164,56 +187,26 @@ def resolve_y(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401746, "x", "x") - - -make_attribute_wrapper(_type_class_unnamed1401746, "y", "y") - - -@register -class _ctor_template_unnamed1401746(ConcreteTemplate): - key = globals()["unnamed1401746"] - cases = [] - - -register_global(unnamed1401746, Function(_ctor_template_unnamed1401746)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - +make_attribute_wrapper(_type_class_unnamed1405416, "x", "x") -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) +make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 -def _lower___nv_bfloat16_void(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): 
shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_1(int &ignore, __nv_bfloat16 *self ) { + _ZN13__nv_bfloat16C1Ev_nbst(int &ignore, __nv_bfloat16 *self ) { new (self) __nv_bfloat16(); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_1", + "_ZN13__nv_bfloat16C1Ev_nbst", int32( CPointer(_type___nv_bfloat16), ), @@ -227,9 +220,7 @@ def __nv_bfloat16_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -253,31 +244,31 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat16_void(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_2(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { + _ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { new (self) __nv_bfloat16(*hr); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_2", - int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1401637)), + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", + int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1405307)), ) def __nv_bfloat16_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat16(arg_0, arg_1) - @lower(__nv_bfloat16, _type_unnamed1401637) + @lower(__nv_bfloat16, _type_unnamed1405307) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_2", shim_raw_str + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -294,7 +285,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat16), - CPointer(_type_unnamed1401637), + CPointer(_type_unnamed1405307), ), (selfptr, *argptrs), ) @@ -302,21 +293,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) -_lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) -def _lower___nv_bfloat16_float16(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_3(int &ignore, __nv_bfloat16 *self , __half* f) { + _ZN13__nv_bfloat16C1E6__half_nbst(int &ignore, __nv_bfloat16 *self , __half* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_3", + "_ZN13__nv_bfloat16C1E6__half_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float16)), ) @@ -327,7 +327,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_3", shim_raw_str + "_ZN13__nv_bfloat16C1E6__half_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -348,21 +348,32 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", 
None) ) + # By default, Numbast does not generate this cast because the C++ conversion + # constructor is marked explicit. We enable it by hand here. + @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_float16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) -def _lower___nv_bfloat16_float32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_4(int &ignore, __nv_bfloat16 *self , float* f) { + _ZN13__nv_bfloat16C1Ef_nbst(int &ignore, __nv_bfloat16 *self , float* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_4", + "_ZN13__nv_bfloat16C1Ef_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float32)), ) @@ -372,9 +383,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, float32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_4", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ef_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -394,21 +403,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) -_lower___nv_bfloat16_float32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) -def _lower___nv_bfloat16_float64(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1Ed(shim_stream, 
shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_5(int &ignore, __nv_bfloat16 *self , double* f) { + _ZN13__nv_bfloat16C1Ed_nbst(int &ignore, __nv_bfloat16 *self , double* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_5", + "_ZN13__nv_bfloat16C1Ed_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float64)), ) @@ -418,9 +436,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, float64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_5", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ed_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -440,21 +456,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_float64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) -def _lower___nv_bfloat16_int16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_6(int &ignore, __nv_bfloat16 *self , short* val) { + _ZN13__nv_bfloat16C1Es_nbst(int &ignore, __nv_bfloat16 *self , short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_6", + "_ZN13__nv_bfloat16C1Es_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int16)), ) @@ -464,9 +489,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int16) def ctor_impl(context, builder, sig, 
args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_6", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Es_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -486,21 +509,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_int16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_7(int &ignore, __nv_bfloat16 *self , unsigned short* val) { + _ZN13__nv_bfloat16C1Et_nbst(int &ignore, __nv_bfloat16 *self , unsigned short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_7", + "_ZN13__nv_bfloat16C1Et_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint16)), ) @@ -510,9 +542,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint16) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_7", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Et_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -532,21 +562,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return 
ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_uint16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) -def _lower___nv_bfloat16_int32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_8(int &ignore, __nv_bfloat16 *self , int* val) { + _ZN13__nv_bfloat16C1Ei_nbst(int &ignore, __nv_bfloat16 *self , int* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_8", + "_ZN13__nv_bfloat16C1Ei_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int32)), ) @@ -556,9 +595,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_8", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ei_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -578,21 +615,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) -_lower___nv_bfloat16_int32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint32(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_9(int &ignore, __nv_bfloat16 *self , unsigned int* val) { + _ZN13__nv_bfloat16C1Ej_nbst(int &ignore, __nv_bfloat16 *self , unsigned int* val) { new (self) __nv_bfloat16(*val); return 0; } 
""" _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_9", + "_ZN13__nv_bfloat16C1Ej_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint32)), ) @@ -602,9 +648,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_9", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ej_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -624,21 +668,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_uint32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_10(int &ignore, __nv_bfloat16 *self , long* val) { + _ZN13__nv_bfloat16C1El_nbst(int &ignore, __nv_bfloat16 *self , long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_10", + "_ZN13__nv_bfloat16C1El_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -648,9 +701,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_10", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1El_nbst", shim_raw_str) selfptr = builder.alloca( 
context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -670,21 +721,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_int64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_11(int &ignore, __nv_bfloat16 *self , unsigned long* val) { + _ZN13__nv_bfloat16C1Em_nbst(int &ignore, __nv_bfloat16 *self , unsigned long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_11", + "_ZN13__nv_bfloat16C1Em_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -694,9 +754,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_11", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Em_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -716,21 +774,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, 
shim_obj): +def _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_12(int &ignore, __nv_bfloat16 *self , long long* val) { + _ZN13__nv_bfloat16C1Ex_nbst(int &ignore, __nv_bfloat16 *self , long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_12", + "_ZN13__nv_bfloat16C1Ex_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -740,9 +807,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_12", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ex_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -762,21 +827,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) -_lower___nv_bfloat16_int64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_13(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { + _ZN13__nv_bfloat16C1Ey_nbst(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_13", + "_ZN13__nv_bfloat16C1Ey_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -786,9 +860,7 @@ def 
__nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_13", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ey_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -808,8 +880,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @register @@ -819,7 +900,7 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): signature( _type___nv_bfloat16, ), - signature(_type___nv_bfloat16, _type_unnamed1401637), + signature(_type___nv_bfloat16, _type_unnamed1405307), signature(_type___nv_bfloat16, float16), signature(_type___nv_bfloat16, float32), signature(_type___nv_bfloat16, float64), @@ -837,18 +918,18 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): register_global(__nv_bfloat16, Function(_ctor_template___nv_bfloat16)) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", - _type_unnamed1401637( + 
"____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -856,11 +937,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -873,28 +955,28 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_2(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", - _type_unnamed1401637( + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -902,11 +984,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def 
_conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", shim_raw_str + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -919,27 +1002,27 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) def _from___nv_bfloat16_to_float32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_float_1(float &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1(float &retval, __nv_bfloat16 *self) { retval = self->operator float(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_float_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", float32( CPointer(_type___nv_bfloat16), ), @@ -952,7 +1035,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_float_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -978,14 +1061,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
____nv_bfloat16_operator_signed_char_1(signed char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1(signed char &retval, __nv_bfloat16 *self) { retval = self->operator signed char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_signed_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -998,7 +1081,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_signed_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1024,14 +1107,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_char_1(unsigned char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1(unsigned char &retval, __nv_bfloat16 *self) { retval = self->operator unsigned char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", uint8( CPointer(_type___nv_bfloat16), ), @@ -1044,7 +1127,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1070,14 +1153,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
____nv_bfloat16_operator_char_1(char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1(char &retval, __nv_bfloat16 *self) { retval = self->operator char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -1090,7 +1173,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1116,14 +1199,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_short_1(short &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1(short &retval, __nv_bfloat16 *self) { retval = self->operator short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", int16( CPointer(_type___nv_bfloat16), ), @@ -1136,7 +1219,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1162,14 +1245,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_short_1(unsigned short &retval, __nv_bfloat16 *self) { + 
____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1(unsigned short &retval, __nv_bfloat16 *self) { retval = self->operator unsigned short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", uint16( CPointer(_type___nv_bfloat16), ), @@ -1182,7 +1265,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1208,14 +1291,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_int_1(int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1(int &retval, __nv_bfloat16 *self) { retval = self->operator int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", int32( CPointer(_type___nv_bfloat16), ), @@ -1228,7 +1311,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1254,14 +1337,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_int_1(unsigned int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1(unsigned int 
&retval, __nv_bfloat16 *self) { retval = self->operator unsigned int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", uint32( CPointer(_type___nv_bfloat16), ), @@ -1274,7 +1357,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1300,14 +1383,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_1(long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1(long &retval, __nv_bfloat16 *self) { retval = self->operator long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1320,7 +1403,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1346,14 +1429,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_1(unsigned long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1(unsigned long &retval, __nv_bfloat16 *self) { retval = self->operator 
unsigned long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1366,7 +1449,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1392,14 +1475,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_long_1(long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1(long long &retval, __nv_bfloat16 *self) { retval = self->operator long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1412,7 +1495,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1438,14 +1521,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_long_1(unsigned long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1(unsigned long long &retval, __nv_bfloat16 *self) { retval = self->operator unsigned 
long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1458,7 +1541,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1484,14 +1567,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_bool_1(bool &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1(bool &retval, __nv_bfloat16 *self) { retval = self->operator bool(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_bool_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", bool_( CPointer(_type___nv_bfloat16), ), @@ -1504,7 +1587,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_bool_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1527,6 +1610,33 @@ def impl(context, builder, fromty, toty, value): _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj) +# C++ does not provide a conversion operator from bfloat16 to double, so we need to implement it manually. 
+def _from___nv_bfloat16_to_float64__lower(): + @lower_cast(_type___nv_bfloat16, float64) + def impl(context, builder, fromty, toty, value): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(value, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + f64 = builder.fpext(f32, ir.DoubleType()) + return f64 + + +_from___nv_bfloat16_to_float64__lower() + + +def _literalint_to_bf16_lower(): + @lower_cast(types.IntegerLiteral, _type___nv_bfloat16) + def impl(context, builder, fromty, toty, value): + f32 = context.cast(builder, value, fromty, float32) + i32 = builder.bitcast(f32, ir.IntType(32)) + i16 = builder.trunc(i32, ir.IntType(16)) + return i16 + + +_literalint_to_bf16_lower() + + # Typing for __nv_bfloat162 class _type_class___nv_bfloat162(Type): def __init__(self): @@ -1568,17 +1678,17 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class___nv_bfloat162, "y", "y") -def _lower___nv_bfloat162_void(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_1(int &ignore, __nv_bfloat162 *self ) { + _ZN14__nv_bfloat162C1Ev_nbst(int &ignore, __nv_bfloat162 *self ) { new (self) __nv_bfloat162(); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_1", + "_ZN14__nv_bfloat162C1Ev_nbst", int32( CPointer(_type___nv_bfloat162), ), @@ -1592,9 +1702,7 @@ def __nv_bfloat162_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN14__nv_bfloat162C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" ) @@ -1618,20 +1726,20 @@ def ctor_impl(context, builder, sig, 
args): ) -_lower___nv_bfloat162_void(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_2(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1EOS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_2", + "_ZN14__nv_bfloat162C1EOS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1642,7 +1750,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_2", shim_raw_str + "_ZN14__nv_bfloat162C1EOS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1668,22 +1776,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -): +def _lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_3(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) { + _ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) { new (self) __nv_bfloat162(*a, *b); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_3", + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", int32( CPointer(_type___nv_bfloat162), 
CPointer(_type___nv_bfloat16), @@ -1698,7 +1804,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1, arg_2): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_3", shim_raw_str + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1725,22 +1831,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -) +_lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_4(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1ERKS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_4", + "_ZN14__nv_bfloat162C1ERKS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1751,7 +1855,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_4", shim_raw_str + "_ZN14__nv_bfloat162C1ERKS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1777,31 +1881,31 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj): +def 
_lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_5(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { + _ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { new (self) __nv_bfloat162(*h2r); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_5", - int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1401746)), + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", + int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1405416)), ) def __nv_bfloat162_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat162(arg_0, arg_1) - @lower(__nv_bfloat162, _type_unnamed1401746) + @lower(__nv_bfloat162, _type_unnamed1405416) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_5", shim_raw_str + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1818,7 +1922,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat162), - CPointer(_type_unnamed1401746), + CPointer(_type_unnamed1405416), ), (selfptr, *argptrs), ) @@ -1826,8 +1930,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) + @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat162, fromty), + [value], + ) + -_lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) @register @@ -1842,25 +1955,25 @@ class _ctor_template___nv_bfloat162(ConcreteTemplate): 
_type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 ), signature(_type___nv_bfloat162, _type___nv_bfloat162), - signature(_type___nv_bfloat162, _type_unnamed1401746), + signature(_type___nv_bfloat162, _type_unnamed1405416), ] register_global(__nv_bfloat162, Function(_ctor_template___nv_bfloat162)) -def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): +def _from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162_operator___nv_bfloat162_raw_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { + ____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { retval = self->operator __nv_bfloat162_raw(); return 0; } """ _op_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", - _type_unnamed1401746( + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + _type_unnamed1405416( CPointer(_type___nv_bfloat162), ), ) @@ -1868,11 +1981,12 @@ def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat162(arg): return _op_decl___nv_bfloat162(arg) - @lower_cast(_type___nv_bfloat162, _type_unnamed1401746) + @lower_cast(_type___nv_bfloat162, _type_unnamed1405416) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", shim_raw_str + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1885,1997 +1999,2083 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat162, signature( - _type_unnamed1401746, + _type_unnamed1405416, CPointer(_type___nv_bfloat162), ), (ptr,), ) 
-_from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj) +_from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj) # Functions: -def make_bfloat162(): +def __double2bfloat16(): pass -def _make_bfloat162_1_lower(shim_stream, shim_obj): +def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - make_bfloat162_1(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { - retval = make_bfloat162(*x, *y); + _ZL17__double2bfloat16d_nbst(__nv_bfloat16 &retval , double* a) { + retval = __double2bfloat16(*a); return 0; } """ - make_bfloat162_1 = declare_device( - "make_bfloat162_1", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL17__double2bfloat16d_nbst = declare_device( + "_ZL17__double2bfloat16d_nbst", _type___nv_bfloat16(CPointer(float64)) ) - def make_bfloat162_1_caller(arg_0, arg_1): - return make_bfloat162_1(arg_0, arg_1) + def _ZL17__double2bfloat16d_nbst_caller(arg_0): + return _ZL17__double2bfloat16d_nbst(arg_0) - @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__double2bfloat16, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("make_bfloat162_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - make_bfloat162_1_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__double2bfloat16d_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float64)), ptrs, ) -_make_bfloat162_1_lower(shim_stream, shim_obj) +_lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj) -def htrunc(): +def 
__float2bfloat16(): pass -def _htrunc_1_lower(shim_stream, shim_obj): +def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htrunc_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = htrunc(*h); + _ZL16__float2bfloat16f_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16(*a); return 0; } """ - htrunc_1 = declare_device( - "htrunc_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL16__float2bfloat16f_nbst = declare_device( + "_ZL16__float2bfloat16f_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def htrunc_1_caller(arg_0): - return htrunc_1(arg_0) + def _ZL16__float2bfloat16f_nbst_caller(arg_0): + return _ZL16__float2bfloat16f_nbst(arg_0) - @lower(htrunc, _type___nv_bfloat16) + @lower(__float2bfloat16, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htrunc_1", shim_raw_str) + shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htrunc_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL16__float2bfloat16f_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_htrunc_1_lower(shim_stream, shim_obj) +_lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj) -def hceil(): +def __float2bfloat16_rn(): pass -def _hceil_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hceil_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hceil(*h); + _ZL19__float2bfloat16_rnf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rn(*a); return 0; } """ - hceil_1 = declare_device( - "hceil_1", 
_type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rnf_nbst = declare_device( + "_ZL19__float2bfloat16_rnf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hceil_1_caller(arg_0): - return hceil_1(arg_0) + def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rnf_nbst(arg_0) - @lower(hceil, _type___nv_bfloat16) + @lower(__float2bfloat16_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hceil_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rnf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hceil_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj) -def hfloor(): +def __float2bfloat16_rz(): pass -def _hfloor_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hfloor_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hfloor(*h); + _ZL19__float2bfloat16_rzf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rz(*a); return 0; } """ - hfloor_1 = declare_device( - "hfloor_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rzf_nbst = declare_device( + "_ZL19__float2bfloat16_rzf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hfloor_1_caller(arg_0): - return hfloor_1(arg_0) + def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rzf_nbst(arg_0) - @lower(hfloor, _type___nv_bfloat16) + @lower(__float2bfloat16_rz, 
float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hfloor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rzf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hfloor_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rzf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hfloor_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj) -def hrint(): +def __float2bfloat16_rd(): pass -def _hrint_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrint_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hrint(*h); + _ZL19__float2bfloat16_rdf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rd(*a); return 0; } """ - hrint_1 = declare_device( - "hrint_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rdf_nbst = declare_device( + "_ZL19__float2bfloat16_rdf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hrint_1_caller(arg_0): - return hrint_1(arg_0) + def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rdf_nbst(arg_0) - @lower(hrint, _type___nv_bfloat16) + @lower(__float2bfloat16_rd, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rdf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, 
"alignof_", None)) return context.compile_internal( builder, - hrint_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rdf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hrint_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj) -def h2trunc(): +def __float2bfloat16_ru(): pass -def _h2trunc_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2trunc_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2trunc(*h); + _ZL19__float2bfloat16_ruf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_ru(*a); return 0; } """ - h2trunc_1 = declare_device( - "h2trunc_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__float2bfloat16_ruf_nbst = declare_device( + "_ZL19__float2bfloat16_ruf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def h2trunc_1_caller(arg_0): - return h2trunc_1(arg_0) + def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_ruf_nbst(arg_0) - @lower(h2trunc, _type___nv_bfloat162) + @lower(__float2bfloat16_ru, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2trunc_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_ruf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2trunc_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__float2bfloat16_ruf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_h2trunc_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj) -def h2ceil(): +def 
__bfloat162float(): pass -def _h2ceil_1_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2ceil_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2ceil(*h); + _ZL16__bfloat162float13__nv_bfloat16_nbst(float &retval , __nv_bfloat16* a) { + retval = __bfloat162float(*a); return 0; } """ - h2ceil_1 = declare_device( - "h2ceil_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__bfloat162float13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", + float32(CPointer(_type___nv_bfloat16)), ) - def h2ceil_1_caller(arg_0): - return h2ceil_1(arg_0) + def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - @lower(h2ceil, _type___nv_bfloat162) + @lower(__bfloat162float, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2ceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2ceil_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__bfloat162float13__nv_bfloat16_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2ceil_1_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2floor(): +def __float2bfloat162_rn(): pass -def _h2floor_1_lower(shim_stream, shim_obj): +def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2floor_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2floor(*h); + 
_ZL20__float2bfloat162_rnf_nbst(__nv_bfloat162 &retval , float* a) { + retval = __float2bfloat162_rn(*a); return 0; } """ - h2floor_1 = declare_device( - "h2floor_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL20__float2bfloat162_rnf_nbst = declare_device( + "_ZL20__float2bfloat162_rnf_nbst", + _type___nv_bfloat162(CPointer(float32)), ) - def h2floor_1_caller(arg_0): - return h2floor_1(arg_0) + def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): + return _ZL20__float2bfloat162_rnf_nbst(arg_0) - @lower(h2floor, _type___nv_bfloat162) + @lower(__float2bfloat162_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2floor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__float2bfloat162_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2floor_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL20__float2bfloat162_rnf_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32)), ptrs, ) -_h2floor_1_lower(shim_stream, shim_obj) +_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj) -def h2rint(): +def __floats2bfloat162_rn(): pass -def _h2rint_1_lower(shim_stream, shim_obj): +def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rint_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2rint(*h); + _ZL21__floats2bfloat162_rnff_nbst(__nv_bfloat162 &retval , float* a, float* b) { + retval = __floats2bfloat162_rn(*a, *b); return 0; } """ - h2rint_1 = declare_device( - "h2rint_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL21__floats2bfloat162_rnff_nbst = declare_device( + "_ZL21__floats2bfloat162_rnff_nbst", + 
_type___nv_bfloat162(CPointer(float32), CPointer(float32)), ) - def h2rint_1_caller(arg_0): - return h2rint_1(arg_0) + def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): + return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - @lower(h2rint, _type___nv_bfloat162) + @lower(__floats2bfloat162_rn, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__floats2bfloat162_rnff_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rint_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL21__floats2bfloat162_rnff_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(float32), CPointer(float32) + ), ptrs, ) -_h2rint_1_lower(shim_stream, shim_obj) +_lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj) -def hsqrt(): +def __low2float(): pass -def _hsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsqrt(*a); + _ZL11__low2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __low2float(*a); return 0; } """ - hsqrt_1 = declare_device( - "hsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL11__low2float14__nv_bfloat162_nbst = declare_device( + "_ZL11__low2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hsqrt_1_caller(arg_0): - return hsqrt_1(arg_0) + def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - @lower(hsqrt, _type___nv_bfloat16) + @lower(__low2float, _type___nv_bfloat162) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL11__low2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL11__low2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrsqrt(): +def __high2float(): pass -def _hrsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrsqrt(*a); + _ZL12__high2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __high2float(*a); return 0; } """ - hrsqrt_1 = declare_device( - "hrsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL12__high2float14__nv_bfloat162_nbst = declare_device( + "_ZL12__high2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hrsqrt_1_caller(arg_0): - return hrsqrt_1(arg_0) + def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - @lower(hrsqrt, _type___nv_bfloat16) + @lower(__high2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL12__high2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in 
zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL12__high2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hrsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrcp(): +def __float22bfloat162_rn(): pass -def _hrcp_1_lower(shim_stream, shim_obj): +def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrcp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrcp(*a); + _ZL21__float22bfloat162_rn6float2_nbst(__nv_bfloat162 &retval , float2* a) { + retval = __float22bfloat162_rn(*a); return 0; } """ - hrcp_1 = declare_device( - "hrcp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL21__float22bfloat162_rn6float2_nbst = declare_device( + "_ZL21__float22bfloat162_rn6float2_nbst", + _type___nv_bfloat162(CPointer(float32x2)), ) - def hrcp_1_caller(arg_0): - return hrcp_1(arg_0) + def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): + return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - @lower(hrcp, _type___nv_bfloat16) + @lower(__float22bfloat162_rn, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__float22bfloat162_rn6float2_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrcp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL21__float22bfloat162_rn6float2_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32x2)), ptrs, ) 
-_hrcp_1_lower(shim_stream, shim_obj) +_lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj) -def hlog(): +def __bfloat1622float2(): pass -def _hlog_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog(*a); + _ZL18__bfloat1622float214__nv_bfloat162_nbst(float2 &retval , __nv_bfloat162* a) { + retval = __bfloat1622float2(*a); return 0; } """ - hlog_1 = declare_device( - "hlog_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat1622float214__nv_bfloat162_nbst = declare_device( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", + float32x2(CPointer(_type___nv_bfloat162)), ) - def hlog_1_caller(arg_0): - return hlog_1(arg_0) + def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): + return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - @lower(hlog, _type___nv_bfloat16) + @lower(__bfloat1622float2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller, + signature(float32x2, CPointer(_type___nv_bfloat162)), ptrs, ) -_hlog_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj) -def hlog2(): +def __bfloat162char_rz(): pass -def _hlog2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, 
shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog2(*a); + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(signed char &retval , __nv_bfloat16* h) { + retval = __bfloat162char_rz(*h); return 0; } """ - hlog2_1 = declare_device( - "hlog2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", + int8(CPointer(_type___nv_bfloat16)), ) - def hlog2_1_caller(arg_0): - return hlog2_1(arg_0) + def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog2, _type___nv_bfloat16) + @lower(__bfloat162char_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller, + signature(int8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hlog10(): +def __bfloat162uchar_rz(): pass -def _hlog10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog10(*a); + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(unsigned char &retval , __nv_bfloat16* h) { + retval = __bfloat162uchar_rz(*h); return 0; } """ - 
hlog10_1 = declare_device( - "hlog10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", + uint8(CPointer(_type___nv_bfloat16)), ) - def hlog10_1_caller(arg_0): - return hlog10_1(arg_0) + def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog10, _type___nv_bfloat16) + @lower(__bfloat162uchar_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller, + signature(uint8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog10_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hexp(): +def __bfloat162int_rn(): pass -def _hexp_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp(*a); + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rn(*h); return 0; } """ - hexp_1 = declare_device( - "hexp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def hexp_1_caller(arg_0): - return 
hexp_1(arg_0) + def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - @lower(hexp, _type___nv_bfloat16) + @lower(__bfloat162int_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_hexp_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh_approx(): +def __bfloat162int_rz(): pass -def _htanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_approx_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh_approx(*a); + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rz(*h); return 0; } """ - htanh_approx_1 = declare_device( - "htanh_approx_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_approx_1_caller(arg_0): - return htanh_approx_1(arg_0) + def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - @lower(htanh_approx, _type___nv_bfloat16) + @lower(__bfloat162int_rz, _type___nv_bfloat16) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htanh_approx_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh_approx(): +def __bfloat162int_rd(): pass -def _h2tanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_approx_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh_approx(*a); + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rd(*h); return 0; } """ - h2tanh_approx_1 = declare_device( - "h2tanh_approx_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def h2tanh_approx_1_caller(arg_0): - return h2tanh_approx_1(arg_0) + def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2tanh_approx, _type___nv_bfloat162) + @lower(__bfloat162int_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + 
"_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_approx_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2tanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh(): +def __bfloat162int_ru(): pass -def _htanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh(*a); + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_ru(*h); return 0; } """ - htanh_1 = declare_device( - "htanh_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_1_caller(arg_0): - return htanh_1(arg_0) + def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - @lower(htanh, _type___nv_bfloat16) + @lower(__bfloat162int_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( 
builder, - htanh_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh(): +def __int2bfloat16_rn(): pass -def _h2tanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh(*a); + _ZL17__int2bfloat16_rni_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rn(*i); return 0; } """ - h2tanh_1 = declare_device( - "h2tanh_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__int2bfloat16_rni_nbst = declare_device( + "_ZL17__int2bfloat16_rni_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def h2tanh_1_caller(arg_0): - return h2tanh_1(arg_0) + def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rni_nbst(arg_0) - @lower(h2tanh, _type___nv_bfloat162) + @lower(__int2bfloat16_rn, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__int2bfloat16_rni_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_h2tanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj) -def hexp2(): +def __int2bfloat16_rz(): pass -def _hexp2_1_lower(shim_stream, shim_obj): +def 
_lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp2(*a); + _ZL17__int2bfloat16_rzi_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rz(*i); return 0; } """ - hexp2_1 = declare_device( - "hexp2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rzi_nbst = declare_device( + "_ZL17__int2bfloat16_rzi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp2_1_caller(arg_0): - return hexp2_1(arg_0) + def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rzi_nbst(arg_0) - @lower(hexp2, _type___nv_bfloat16) + @lower(__int2bfloat16_rz, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp2_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rzi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp2_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj) -def hexp10(): +def __int2bfloat16_rd(): pass -def _hexp10_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp10(*a); + _ZL17__int2bfloat16_rdi_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rd(*i); return 0; } """ - hexp10_1 = declare_device( - "hexp10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rdi_nbst = declare_device( + 
"_ZL17__int2bfloat16_rdi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp10_1_caller(arg_0): - return hexp10_1(arg_0) + def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rdi_nbst(arg_0) - @lower(hexp10, _type___nv_bfloat16) + @lower(__int2bfloat16_rd, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp10_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rdi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp10_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj) -def hcos(): +def __int2bfloat16_ru(): pass -def _hcos_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hcos_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hcos(*a); + _ZL17__int2bfloat16_rui_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_ru(*i); return 0; } """ - hcos_1 = declare_device( - "hcos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rui_nbst = declare_device( + "_ZL17__int2bfloat16_rui_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hcos_1_caller(arg_0): - return hcos_1(arg_0) + def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rui_nbst(arg_0) - @lower(hcos, _type___nv_bfloat16) + @lower(__int2bfloat16_ru, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hcos_1", shim_raw_str) + 
shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hcos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rui_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hcos_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj) -def hsin(): +def __bfloat162short_rn(): pass -def _hsin_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsin_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsin(*a); + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rn(*h); return 0; } """ - hsin_1 = declare_device( - "hsin_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def hsin_1_caller(arg_0): - return hsin_1(arg_0) + def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - @lower(hsin, _type___nv_bfloat16) + @lower(__bfloat162short_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsin_1_caller, - 
signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_hsin_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sqrt(): +def __bfloat162short_rz(): pass -def _h2sqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sqrt(*a); + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rz(*h); return 0; } """ - h2sqrt_1 = declare_device( - "h2sqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2sqrt_1_caller(arg_0): - return h2sqrt_1(arg_0) + def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2sqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sqrt_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sqrt_1_lower(shim_stream, shim_obj) 
+_lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rsqrt(): +def __bfloat162short_rd(): pass -def _h2rsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rsqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rsqrt(*a); + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rd(*h); return 0; } """ - h2rsqrt_1 = declare_device( - "h2rsqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rsqrt_1_caller(arg_0): - return h2rsqrt_1(arg_0) + def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2rsqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rsqrt_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rcp(): +def __bfloat162short_ru(): pass -def _h2rcp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, 
shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rcp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rcp(*a); + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_ru(*h); return 0; } """ - h2rcp_1 = declare_device( - "h2rcp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rcp_1_caller(arg_0): - return h2rcp_1(arg_0) + def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2rcp, _type___nv_bfloat162) + @lower(__bfloat162short_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rcp_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rcp_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2log(): +def __short2bfloat16_rn(): pass -def _h2log_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log(*a); + _ZL19__short2bfloat16_rns_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rn(*i); return 0; } """ - h2log_1 = 
declare_device( - "h2log_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rns_nbst = declare_device( + "_ZL19__short2bfloat16_rns_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log_1_caller(arg_0): - return h2log_1(arg_0) + def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rns_nbst(arg_0) - @lower(h2log, _type___nv_bfloat162) + @lower(__short2bfloat16_rn, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rns_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rns_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj) -def h2log2(): +def __short2bfloat16_rz(): pass -def _h2log2_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log2(*a); + _ZL19__short2bfloat16_rzs_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rz(*i); return 0; } """ - h2log2_1 = declare_device( - "h2log2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rzs_nbst = declare_device( + "_ZL19__short2bfloat16_rzs_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log2_1_caller(arg_0): - return h2log2_1(arg_0) + def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rzs_nbst(arg_0) - @lower(h2log2, _type___nv_bfloat162) + 
@lower(__short2bfloat16_rz, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rzs_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rzs_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log2_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj) -def h2log10(): +def __short2bfloat16_rd(): pass -def _h2log10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log10(*a); + _ZL19__short2bfloat16_rds_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rd(*i); return 0; } """ - h2log10_1 = declare_device( - "h2log10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rds_nbst = declare_device( + "_ZL19__short2bfloat16_rds_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log10_1_caller(arg_0): - return h2log10_1(arg_0) + def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rds_nbst(arg_0) - @lower(h2log10, _type___nv_bfloat162) + @lower(__short2bfloat16_rd, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rds_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rds_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log10_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj) -def h2exp(): +def __short2bfloat16_ru(): pass -def _h2exp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp(*a); + _ZL19__short2bfloat16_rus_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_ru(*i); return 0; } """ - h2exp_1 = declare_device( - "h2exp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rus_nbst = declare_device( + "_ZL19__short2bfloat16_rus_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2exp_1_caller(arg_0): - return h2exp_1(arg_0) + def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rus_nbst(arg_0) - @lower(h2exp, _type___nv_bfloat162) + @lower(__short2bfloat16_ru, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rus_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rus_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2exp_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj) -def h2exp2(): 
+def __bfloat162uint_rn(): pass -def _h2exp2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp2(*a); + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rn(*h); return 0; } """ - h2exp2_1 = declare_device( - "h2exp2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp2_1_caller(arg_0): - return h2exp2_1(arg_0) + def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - @lower(h2exp2, _type___nv_bfloat162) + @lower(__bfloat162uint_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2exp10(): +def __bfloat162uint_rz(): pass -def _h2exp10_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp10_1(__nv_bfloat162 &retval , __nv_bfloat162* 
a) { - retval = h2exp10(*a); + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rz(*h); return 0; } """ - h2exp10_1 = declare_device( - "h2exp10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp10_1_caller(arg_0): - return h2exp10_1(arg_0) + def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2exp10, _type___nv_bfloat162) + @lower(__bfloat162uint_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp10_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2cos(): +def __bfloat162uint_rd(): pass -def _h2cos_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2cos_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2cos(*a); + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rd(*h); return 0; } """ - h2cos_1 = declare_device( - "h2cos_1", 
_type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2cos_1_caller(arg_0): - return h2cos_1(arg_0) + def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2cos, _type___nv_bfloat162) + @lower(__bfloat162uint_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2cos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2cos_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2cos_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sin(): +def __bfloat162uint_ru(): pass -def _h2sin_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sin_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sin(*a); + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_ru(*h); return 0; } """ - h2sin_1 = declare_device( - "h2sin_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2sin_1_caller(arg_0): - return h2sin_1(arg_0) + def 
_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2sin, _type___nv_bfloat162) + @lower(__bfloat162uint_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sin_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sin_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def atomicAdd(): +def __uint2bfloat16_rn(): pass -def _atomicAdd_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_1(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rnj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rn(*i); return 0; } """ - atomicAdd_1 = declare_device( - "atomicAdd_1", - _type___nv_bfloat162( - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst = declare_device( + "_ZL18__uint2bfloat16_rnj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def atomicAdd_1_caller(arg_0, arg_1): - return atomicAdd_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + 
@lower(__uint2bfloat16_rn, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rnj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_1_caller, - signature( - _type___nv_bfloat162, - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rz(): + pass -def _atomicAdd_2_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_2(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rzj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rz(*i); return 0; } """ - atomicAdd_2 = declare_device( - "atomicAdd_2", - _type___nv_bfloat16( - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst = declare_device( + "_ZL18__uint2bfloat16_rzj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def atomicAdd_2_caller(arg_0, arg_1): - return atomicAdd_2(arg_0, arg_1) + def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__uint2bfloat16_rz, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_2", shim_raw_str) + 
shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rzj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_2_caller, - signature( - _type___nv_bfloat16, - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_2_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rd(): + pass -def _operator_add_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+(*lh, *rh); + _ZL18__uint2bfloat16_rdj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rd(*i); return 0; } """ - operator_add_1 = declare_device( - "operator_add_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_rdj_nbst = declare_device( + "_ZL18__uint2bfloat16_rdj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_add_1_caller(arg_0, arg_1): - return operator_add_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_rd, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_add_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rdj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_add_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rdj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_add_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj) -def _operator_sub_1_lower(shim_stream, shim_obj): +def __uint2bfloat16_ru(): + pass + + +def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-(*lh, *rh); + _ZL18__uint2bfloat16_ruj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_ru(*i); return 0; } """ - operator_sub_1 = declare_device( - "operator_sub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_ruj_nbst = declare_device( + "_ZL18__uint2bfloat16_ruj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_sub_1_caller(arg_0, arg_1): - return operator_sub_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_ru, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_ruj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_sub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - 
CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_ruj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_sub_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rn(): + pass -def _operator_mul_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*(*lh, *rh); + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rn(*h); return 0; } """ - operator_mul_1 = declare_device( - "operator_mul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_mul_1_caller(arg_0, arg_1): - return operator_mul_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + 
_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_mul_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rz(): + pass -def _operator_truediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/(*lh, *rh); + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rz(*h); return 0; } """ - operator_truediv_1 = declare_device( - "operator_truediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_1_caller(arg_0, arg_1): - return operator_truediv_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + 
_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_truediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rd(): + pass -def _operator_iadd_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+=(*lh, *rh); + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rd(*h); return 0; } """ - operator_iadd_1 = declare_device( - "operator_iadd_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_1_caller(arg_0, arg_1): - return operator_iadd_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + 
_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_isub_1_lower(shim_stream, shim_obj): +def __bfloat162ushort_ru(): + pass + + +def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-=(*lh, *rh); + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_ru(*h); return 0; } """ - operator_isub_1 = declare_device( - "operator_isub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_isub_1_caller(arg_0, arg_1): - return operator_isub_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_isub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + 
_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rn(): + pass -def _operator_imul_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_imul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*=(*lh, *rh); + _ZL20__ushort2bfloat16_rnt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rn(*i); return 0; } """ - operator_imul_1 = declare_device( - "operator_imul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rnt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rnt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_imul_1_caller(arg_0, arg_1): - return operator_imul_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rn, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rnt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rnt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_imul_1_lower(shim_stream, 
shim_obj) +_lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rz(): + pass -def _operator_itruediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/=(*lh, *rh); + _ZL20__ushort2bfloat16_rzt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rz(*i); return 0; } """ - operator_itruediv_1 = declare_device( - "operator_itruediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rzt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rzt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_itruediv_1_caller(arg_0, arg_1): - return operator_itruediv_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rz, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rzt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rzt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_itruediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rd(): + pass -def 
_operator_pos_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator+(*h); + _ZL20__ushort2bfloat16_rdt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rd(*i); return 0; } """ - operator_pos_1 = declare_device( - "operator_pos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rdt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rdt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_pos_1_caller(arg_0): - return operator_pos_1(arg_0) + def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rd, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rdt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rdt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_pos_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_ru(): + pass -def _operator_neg_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_neg_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator-(*h); + _ZL20__ushort2bfloat16_rut_nbst(__nv_bfloat16 &retval , unsigned short* i) { + 
retval = __ushort2bfloat16_ru(*i); return 0; } """ - operator_neg_1 = declare_device( - "operator_neg_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rut_nbst = declare_device( + "_ZL20__ushort2bfloat16_rut_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_neg_1_caller(arg_0): - return operator_neg_1(arg_0) + def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat16) + @lower(__ushort2bfloat16_ru, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rut_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_neg_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rut_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_neg_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rn(): + pass -def _operator_eq_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_eq_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator==(*lh, *rh); + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rn(*h); return 0; } """ - operator_eq_1 = declare_device( - "operator_eq_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", + 
uint64(CPointer(_type___nv_bfloat16)), ) - def operator_eq_1_caller(arg_0, arg_1): - return operator_eq_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_eq_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_eq_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rz(): + pass -def _operator_ne_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ne_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator!=(*lh, *rh); + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rz(*h); return 0; } """ - operator_ne_1 = declare_device( - "operator_ne_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ne_1_caller(arg_0, arg_1): - return operator_ne_1(arg_0, arg_1) + def 
_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ne_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ne_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + +def make_bfloat162(): + pass -def _operator_gt_1_lower(shim_stream, shim_obj): + +def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_gt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>(*lh, *rh); + _ZL14make_bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { + retval = make_bfloat162(*x, *y); return 0; } """ - operator_gt_1 = declare_device( - "operator_gt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL14make_bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), ) - def operator_gt_1_caller(arg_0, arg_1): - return operator_gt_1(arg_0, arg_1) + def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + 
return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_gt_1_caller, + _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller, signature( - bool_, + _type___nv_bfloat162, CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16), ), @@ -3883,858 +4083,11629 @@ def impl(context, builder, sig, args): ) -_operator_gt_1_lower(shim_stream, shim_obj) +_lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) -def _operator_lt_1_lower(shim_stream, shim_obj): - shim_raw_str = """ +def __bfloat162ull_rd(): + pass + + +def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ extern "C" __device__ int - operator_lt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<(*lh, *rh); + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rd(*h); return 0; } """ - operator_lt_1 = declare_device( - "operator_lt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_lt_1_caller(arg_0, arg_1): - return operator_lt_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - 
@lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_lt_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_lt_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_ru(): + pass -def _operator_ge_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ge_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>=(*lh, *rh); + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_ru(*h); return 0; } """ - operator_ge_1 = declare_device( - "operator_ge_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ge_1_caller(arg_0, arg_1): - return operator_ge_1(arg_0, arg_1) + def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_ru, _type___nv_bfloat16) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ge_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ge_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rn(): + pass -def _operator_le_1_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_le_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<=(*lh, *rh); + _ZL17__ull2bfloat16_rny_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rn(*i); return 0; } """ - operator_le_1 = declare_device( - "operator_le_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__ull2bfloat16_rny_nbst = declare_device( + "_ZL17__ull2bfloat16_rny_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_le_1_caller(arg_0, arg_1): - return operator_le_1(arg_0, arg_1) + def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rny_nbst(arg_0) - @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ull2bfloat16_rn, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) 
ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_le_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__ull2bfloat16_rny_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_le_1_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rz(): + pass -def _operator_add_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+(*lh, *rh); + _ZL17__ull2bfloat16_rzy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rz(*i); return 0; } """ - operator_add_2 = declare_device( - "operator_add_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rzy_nbst = declare_device( + "_ZL17__ull2bfloat16_rzy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_add_2_caller(arg_0, arg_1): - return operator_add_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rz, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_add_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - 
operator_add_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rzy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_add_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rd(): + pass -def _operator_sub_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-(*lh, *rh); + _ZL17__ull2bfloat16_rdy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rd(*i); return 0; } """ - operator_sub_2 = declare_device( - "operator_sub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rdy_nbst = declare_device( + "_ZL17__ull2bfloat16_rdy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_sub_2_caller(arg_0, arg_1): - return operator_sub_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rd, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_sub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rdy_nbst_caller, + signature(_type___nv_bfloat16, 
CPointer(uint64)), ptrs, ) -_operator_sub_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj) -def _operator_mul_2_lower(shim_stream, shim_obj): +def __ull2bfloat16_ru(): + pass + + +def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*(*lh, *rh); + _ZL17__ull2bfloat16_ruy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_ru(*i); return 0; } """ - operator_mul_2 = declare_device( - "operator_mul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_ruy_nbst = declare_device( + "_ZL17__ull2bfloat16_ruy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_mul_2_caller(arg_0, arg_1): - return operator_mul_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_ru, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_ruy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_mul_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rn(): + pass -def 
_operator_truediv_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator/(*lh, *rh); + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rn(*h); return 0; } """ - operator_truediv_2 = declare_device( - "operator_truediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_2_caller(arg_0, arg_1): - return operator_truediv_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_truediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_iadd_2_lower(shim_stream, shim_obj): +def __bfloat162ll_rz(): + pass + + 
+def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+=(*lh, *rh); + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rz(*h); return 0; } """ - operator_iadd_2 = declare_device( - "operator_iadd_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_2_caller(arg_0, arg_1): - return operator_iadd_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rd(): + pass -def _operator_isub_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, 
shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-=(*lh, *rh); + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rd(*h); return 0; } """ - operator_isub_2 = declare_device( - "operator_isub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_isub_2_caller(arg_0, arg_1): - return operator_isub_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_isub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_ru(): + pass -def _operator_imul_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
operator_imul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*=(*lh, *rh); + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_ru(*h); return 0; } """ - operator_imul_2 = declare_device( - "operator_imul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_imul_2_caller(arg_0, arg_1): - return operator_imul_2(arg_0, arg_1) + def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_imul_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rn(): + pass -def _operator_itruediv_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) 
{ - retval = operator/=(*lh, *rh); + _ZL16__ll2bfloat16_rnx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rn(*i); return 0; } """ - operator_itruediv_2 = declare_device( - "operator_itruediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__ll2bfloat16_rnx_nbst = declare_device( + "_ZL16__ll2bfloat16_rnx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_itruediv_2_caller(arg_0, arg_1): - return operator_itruediv_2(arg_0, arg_1) + def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ll2bfloat16_rn, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__ll2bfloat16_rnx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_itruediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rz(): + pass -def _operator_pos_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_2(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = operator+(*h); + _ZL16__ll2bfloat16_rzx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rz(*i); return 0; } """ - operator_pos_2 = declare_device( - "operator_pos_2", 
_type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rzx_nbst = declare_device( + "_ZL16__ll2bfloat16_rzx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_pos_2_caller(arg_0): - return operator_pos_2(arg_0) + def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat162) + @lower(__ll2bfloat16_rz, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__ll2bfloat16_rzx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_pos_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj) -def _operator_neg_2_lower(shim_stream, shim_obj): +def __ll2bfloat16_rd(): + pass + + +def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_neg_2(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = operator-(*h); + _ZL16__ll2bfloat16_rdx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rd(*i); return 0; } """ - operator_neg_2 = declare_device( - "operator_neg_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rdx_nbst = declare_device( + "_ZL16__ll2bfloat16_rdx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_neg_2_caller(arg_0): - return operator_neg_2(arg_0) + def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat162) + 
@lower(__ll2bfloat16_rd, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_neg_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__ll2bfloat16_rdx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_neg_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_ru(): + pass -def _operator_eq_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_eq_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator==(*lh, *rh); + _ZL16__ll2bfloat16_rux_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_ru(*i); + return 0; + } + """ + + _ZL16__ll2bfloat16_rux_nbst = declare_device( + "_ZL16__ll2bfloat16_rux_nbst", _type___nv_bfloat16(CPointer(int64)) + ) + + def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rux_nbst(arg_0) + + @lower(__ll2bfloat16_ru, int64) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__ll2bfloat16_rux_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), + ptrs, + ) + + 
+_lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj) + + +def htrunc(): + pass + + +def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6htrunc13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = htrunc(*h); + return 0; + } + """ + + _ZL6htrunc13__nv_bfloat16_nbst = declare_device( + "_ZL6htrunc13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) + + @lower(htrunc, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6htrunc13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6htrunc13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hceil(): + pass + + +def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hceil13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hceil(*h); return 0; } """ - operator_eq_2 = declare_device( - "operator_eq_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) + _ZL5hceil13__nv_bfloat16_nbst = declare_device( + "_ZL5hceil13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hceil13__nv_bfloat16_nbst(arg_0) + + @lower(hceil, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL5hceil13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hceil13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hfloor(): + pass + + +def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hfloor13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hfloor(*h); + return 0; + } + """ + + _ZL6hfloor13__nv_bfloat16_nbst = declare_device( + "_ZL6hfloor13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) + + @lower(hfloor, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hfloor13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hfloor13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrint(): + pass + + +def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hrint13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hrint(*h); + return 0; + } + """ + + _ZL5hrint13__nv_bfloat16_nbst = declare_device( + 
"_ZL5hrint13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hrint13__nv_bfloat16_nbst(arg_0) + + @lower(hrint, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hrint13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hrint13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2trunc(): + pass + + +def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2trunc14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2trunc(*h); + return 0; + } + """ + + _ZL7h2trunc14__nv_bfloat162_nbst = declare_device( + "_ZL7h2trunc14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) + + @lower(h2trunc, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2trunc14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2trunc14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, 
shim_obj) + + +def h2ceil(): + pass + + +def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2ceil14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2ceil(*h); + return 0; + } + """ + + _ZL6h2ceil14__nv_bfloat162_nbst = declare_device( + "_ZL6h2ceil14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) + + @lower(h2ceil, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2ceil14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2ceil14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2floor(): + pass + + +def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2floor14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2floor(*h); + return 0; + } + """ + + _ZL7h2floor14__nv_bfloat162_nbst = declare_device( + "_ZL7h2floor14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) + + @lower(h2floor, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2floor14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for 
arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2floor14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rint(): + pass + + +def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2rint14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2rint(*h); + return 0; + } + """ + + _ZL6h2rint14__nv_bfloat162_nbst = declare_device( + "_ZL6h2rint14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) + + @lower(h2rint, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2rint14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2rint14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat162bfloat162(): + pass + + +def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(__nv_bfloat162 &retval , __nv_bfloat16* a) { + retval = __bfloat162bfloat162(*a); + return 0; + } + """ + + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", + 
_type___nv_bfloat162(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat162bfloat162, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __lowhigh2highlow(): + pass + + +def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __lowhigh2highlow(*a); + return 0; + } + """ + + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst = declare_device( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): + return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) + + @lower(__lowhigh2highlow, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __lows2bfloat162(): + pass + + +def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __lows2bfloat162(*a, *b); + return 0; + } + """ + + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __highs2bfloat162(): + pass + + +def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __highs2bfloat162(*a, *b); + 
return 0; + } + """ + + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __high2bfloat16(): + pass + + +def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__high2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __high2bfloat16(*a); + return 0; + } + """ + + _ZL15__high2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __low2bfloat16(): + pass + + +def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__low2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __low2bfloat16(*a); + return 0; + } + """ + + _ZL14__low2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hisinf(): + pass + + +def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisinf13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* a) { + retval = __hisinf(*a); + return 0; + } + """ + + 
_ZL8__hisinf13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisinf13__nv_bfloat16_nbst", int32(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) + + @lower(__hisinf, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisinf13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisinf13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __halves2bfloat162(): + pass + + +def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __halves2bfloat162(*a, *b); + return 0; + } + """ + + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __low2bfloat162(): + pass + + +def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__low2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __low2bfloat162(*a); + return 0; + } + """ + + _ZL15__low2bfloat16214__nv_bfloat162_nbst = declare_device( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __high2bfloat162(): + pass + + +def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__high2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __high2bfloat162(*a); + return 0; + } + """ + + _ZL16__high2bfloat16214__nv_bfloat162_nbst = 
declare_device( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_short(): + pass + + +def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_short(*h); + return 0; + } + """ + + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_short, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return 
context.compile_internal( + builder, + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_ushort(): + pass + + +def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_ushort(*h); + return 0; + } + """ + + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_ushort, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __short_as_bfloat16(): + pass + + +def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__short_as_bfloat16s_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short_as_bfloat16(*i); + return 0; + } + """ + + _ZL19__short_as_bfloat16s_nbst = declare_device( + "_ZL19__short_as_bfloat16s_nbst", _type___nv_bfloat16(CPointer(int16)) + ) + + def 
_ZL19__short_as_bfloat16s_nbst_caller(arg_0): + return _ZL19__short_as_bfloat16s_nbst(arg_0) + + @lower(__short_as_bfloat16, int16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__short_as_bfloat16s_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL19__short_as_bfloat16s_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), + ptrs, + ) + + +_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj) + + +def __ushort_as_bfloat16(): + pass + + +def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__ushort_as_bfloat16t_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort_as_bfloat16(*i); + return 0; + } + """ + + _ZL20__ushort_as_bfloat16t_nbst = declare_device( + "_ZL20__ushort_as_bfloat16t_nbst", _type___nv_bfloat16(CPointer(uint16)) + ) + + def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): + return _ZL20__ushort_as_bfloat16t_nbst(arg_0) + + @lower(__ushort_as_bfloat16, uint16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__ushort_as_bfloat16t_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__ushort_as_bfloat16t_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), + ptrs, + ) + + +_lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj) + + +def __shfl_sync(): + pass + + +def _lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + 
shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def __shfl_up_sync(): + pass + + +def _lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + 
CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_down_sync(): + pass + + +def _lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_xor_sync(): + pass + + +def _lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* 
mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( + 
arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for 
arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def __ldg(): + pass + + +def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK14__nv_bfloat162_nbst = declare_device( + "_ZL5__ldgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK13__nv_bfloat16_nbst = declare_device( + 
"_ZL5__ldgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcg(): + pass + + +def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, 
CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldca(): + pass + + +def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcs(): + pass + + +def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 
** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcsPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__ldcsPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldlu(): + pass + + +def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldluPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldlu, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldluPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldlu, 
CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcv(): + pass + + +def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcvPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL6__ldcvPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __stwb(): + pass + + +def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwbP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwbP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcg(): + pass + + +def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + 
__stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcgP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcgP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, 
ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcs(): + pass + + +def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + _ZL6__stcsP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcsP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + 
_ZL6__stcsP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcsP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stwt(): + pass + + +def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwtP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, 
sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwtP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __heq2(): + pass + + +def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__heq214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __heq2(*a, *b); + return 0; + } + """ + + _ZL6__heq214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__heq214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__heq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__heq214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2(): + pass + + +def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hne214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2(*a, *b); + return 0; + } + """ + + _ZL6__hne214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hne214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hne214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2(): + pass + + +def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hle214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2(*a, *b); + return 0; + } + """ + + _ZL6__hle214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hle214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hle214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hle214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2(): + pass + + +def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hge214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2(*a, *b); + return 0; + } + """ + + _ZL6__hge214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__hge214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hge214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2(): + pass + + +def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hlt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2(*a, *b); + return 0; + } + """ + + _ZL6__hlt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hlt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hlt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hlt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2(): + pass + + +def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2(*a, *b); + return 0; + } + """ + + _ZL6__hgt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hgt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hequ2(): + pass + + +def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hequ214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2(*a, *b); + return 0; + } + """ + + _ZL7__hequ214__nv_bfloat162S__nbst = 
declare_device( + "_ZL7__hequ214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hequ214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2(): + pass + + +def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2(*a, *b); + return 0; + } + """ + + _ZL7__hneu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hneu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2(): + pass + + +def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hleu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2(*a, *b); + return 0; + } + """ + + _ZL7__hleu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hleu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hleu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2(): + pass + + +def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgeu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2(*a, *b); + return 0; + } + """ + + 
_ZL7__hgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgeu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgeu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2(): + pass + + +def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hltu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2(*a, *b); + return 0; + } + """ + + _ZL7__hltu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hltu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hltu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2(): + pass + + +def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgtu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2(*a, *b); + return 0; + } + """ + + _ZL7__hgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgtu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgtu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq2_mask(): + pass + + +def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__heq2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = 
__heq2_mask(*a, *b); + return 0; + } + """ + + _ZL11__heq2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2_mask(): + pass + + +def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hne2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hne2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2_mask(): + pass + + +def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hle2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hle2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2_mask(): + pass + + +def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + 
_ZL11__hge2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hge2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2_mask(): + pass + + +def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hlt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hlt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2_mask(): + pass + + +def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hgt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hgt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def 
__hequ2_mask(): + pass + + +def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hequ2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hequ2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2_mask(): + pass + + +def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hneu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hneu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return 
_ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2_mask(): + pass + + +def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hleu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hleu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, 
+ CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2_mask(): + pass + + +def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2_mask(): + pass + + +def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hltu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hltu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", + 
uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2_mask(): + pass + + +def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hisnan2(): + pass + + +def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hisnan214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hisnan2(*a); + return 0; + } + """ + + _ZL9__hisnan214__nv_bfloat162_nbst = declare_device( + "_ZL9__hisnan214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): + return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) + + @lower(__hisnan2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hisnan214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hisnan214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2(): + pass + + +def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hadd214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2(*a, *b); + return 0; + } + """ + + _ZL7__hadd214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hadd214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), 
CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hadd214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hadd214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2(): + pass + + +def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hsub214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2(*a, *b); + return 0; + } + """ + + _ZL7__hsub214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hsub214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hsub214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL7__hsub214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2(): + pass + + +def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmul214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2(*a, *b); + return 0; + } + """ + + _ZL7__hmul214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmul214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmul214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmul214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hadd2_rn(): + pass + + +def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hadd2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", 
+ _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_rn(): + pass + + +def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hsub2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_rn(): + pass + + +def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hmul2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __h2div(): + pass + + +def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__h2div14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + 
retval = __h2div(*a, *b); + return 0; + } + """ + + _ZL7__h2div14__nv_bfloat162S__nbst = declare_device( + "_ZL7__h2div14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__h2div14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__h2div14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __habs2(): + pass + + +def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__habs214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __habs2(*a); + return 0; + } + """ + + _ZL7__habs214__nv_bfloat162_nbst = declare_device( + "_ZL7__habs214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__habs214__nv_bfloat162_nbst(arg_0) + + @lower(__habs2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__habs214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__habs214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2_sat(): + pass + + +def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hadd2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hadd2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_sat(): + pass + + +def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hsub2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_sat(*a, *b); + return 0; + } + """ + + 
_ZL11__hsub2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_sat(): + pass + + +def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmul2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hmul2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2(): + pass + + +def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hfma214__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2(*a, *b, *c); + return 0; + } + """ + + _ZL7__hfma214__nv_bfloat162S_S__nbst = declare_device( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hfma214__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hfma2_sat(): 
+ pass + + +def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_sat(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst = declare_device( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_sat, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hneg2(): + pass + + +def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneg214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hneg2(*a); + return 0; + } + """ + + _ZL7__hneg214__nv_bfloat162_nbst = declare_device( + "_ZL7__hneg214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) + + @lower(__hneg2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneg214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneg214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __habs(): + pass + + +def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__habs13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __habs(*a); + return 0; + } + """ + + _ZL6__habs13__nv_bfloat16_nbst = declare_device( + "_ZL6__habs13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__habs13__nv_bfloat16_nbst(arg_0) + + @lower(__habs, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__habs13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__habs13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hadd(): + pass + + +def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hadd13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd(*a, *b); + return 0; + } + """ + + _ZL6__hadd13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hadd13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hadd13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hadd13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub(): + pass + + +def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hsub13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub(*a, *b); + return 0; + } + """ + + _ZL6__hsub13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hsub13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL6__hsub13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hsub13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul(): + pass + + +def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmul13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul(*a, *b); + return 0; + } + """ + + _ZL6__hmul13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmul13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmul13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmul13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_rn(): + pass + + +def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL9__hadd_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_rn(*a, *b); + return 0; + } + """ + + _ZL9__hadd_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_rn(): + pass + + +def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hsub_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_rn(*a, *b); + return 0; + } + """ + + _ZL9__hsub_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul_rn(): + pass + + +def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hmul_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_rn(*a, *b); + return 0; + } + """ + + _ZL9__hmul_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hdiv(): + pass + + +def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hdiv13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hdiv(*a, *b); + return 0; + } + """ + + _ZL6__hdiv13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hdiv13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hdiv13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hdiv13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_sat(): + pass + + +def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_sat(*a, *b); + return 0; + } + """ + + _ZL10__hadd_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_sat(): + pass + + +def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_sat(*a, *b); + return 0; + } + """ + + _ZL10__hsub_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def 
__hmul_sat(): + pass + + +def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_sat(*a, *b); + return 0; + } + """ + + _ZL10__hmul_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma(): + pass + + +def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hfma13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma(*a, *b, *c); + return 0; + } + """ + + _ZL6__hfma13__nv_bfloat16S_S__nbst = declare_device( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return 
_ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hfma13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hfma_sat(): + pass + + +def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_sat(*a, *b, *c); + return 0; + } + """ + + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst = declare_device( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_sat, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hneg(): + pass + + +def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneg13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __hneg(*a); + return 0; + } + """ + + _ZL6__hneg13__nv_bfloat16_nbst = declare_device( + "_ZL6__hneg13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) + + @lower(__hneg, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneg13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneg13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hbeq2(): + pass + + +def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbeq214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbeq2(*a, *b); + return 0; + } + """ + + _ZL7__hbeq214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbeq214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbeq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbeq214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbne2(): + pass + + +def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbne214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbne2(*a, *b); + return 0; + } + """ + + _ZL7__hbne214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbne214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbne214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hble2(): + pass + + +def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hble214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hble2(*a, *b); + return 0; + } + """ + + _ZL7__hble214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hble214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hble214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hble214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbge2(): + pass + + +def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbge214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbge2(*a, *b); + return 0; + } + """ + + _ZL7__hbge214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbge214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, 
arg_1) + + @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbge214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hblt2(): + pass + + +def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hblt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hblt2(*a, *b); + return 0; + } + """ + + _ZL7__hblt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hblt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hblt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hblt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, 
shim_obj) + + +def __hbgt2(): + pass + + +def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbgt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgt2(*a, *b); + return 0; + } + """ + + _ZL7__hbgt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbgt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbgt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbequ2(): + pass + + +def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbequ214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbequ2(*a, *b); + return 0; + } + """ + + _ZL8__hbequ214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbequ214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, 
sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbequ214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbneu2(): + pass + + +def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbneu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbneu2(*a, *b); + return 0; + } + """ + + _ZL8__hbneu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbneu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbneu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbleu2(): + pass + + +def 
_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbleu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbleu2(*a, *b); + return 0; + } + """ + + _ZL8__hbleu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbleu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbleu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgeu2(): + pass + + +def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgeu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgeu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgeu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbltu2(): + pass + + +def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbltu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbltu2(*a, *b); + return 0; + } + """ + + _ZL8__hbltu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbltu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbltu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgtu2(): + pass + + +def 
_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgtu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgtu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgtu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq(): + pass + + +def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__heq13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __heq(*a, *b); + return 0; + } + """ + + _ZL5__heq13__nv_bfloat16S__nbst = declare_device( + "_ZL5__heq13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL5__heq13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__heq13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hne(): + pass + + +def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hne13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hne(*a, *b); + return 0; + } + """ + + _ZL5__hne13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hne13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hne13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hne13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hle(): + pass + + +def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hle13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, 
__nv_bfloat16* b) { + retval = __hle(*a, *b); + return 0; + } + """ + + _ZL5__hle13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hle13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hle13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hle13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hge(): + pass + + +def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hge13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hge(*a, *b); + return 0; + } + """ + + _ZL5__hge13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hge13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hge13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hge13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hlt(): + pass + + +def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hlt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hlt(*a, *b); + return 0; + } + """ + + _ZL5__hlt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hlt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hlt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hlt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgt(): + pass + + +def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hgt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgt(*a, *b); + return 0; + } + """ + + _ZL5__hgt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hgt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + 
) + + def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hgt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hgt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hequ(): + pass + + +def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hequ13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hequ(*a, *b); + return 0; + } + """ + + _ZL6__hequ13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hequ13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hequ13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hequ13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + 
ptrs, + ) + + +_lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hneu(): + pass + + +def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hneu(*a, *b); + return 0; + } + """ + + _ZL6__hneu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hneu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hleu(): + pass + + +def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hleu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hleu(*a, *b); + return 0; + } + """ + + _ZL6__hleu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hleu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, 
builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hleu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hleu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgeu(): + pass + + +def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgeu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgeu(*a, *b); + return 0; + } + """ + + _ZL6__hgeu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgeu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgeu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgeu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hltu(): + pass + + +def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = 
""" + extern "C" __device__ int + _ZL6__hltu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hltu(*a, *b); + return 0; + } + """ + + _ZL6__hltu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hltu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hltu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hltu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgtu(): + pass + + +def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgtu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgtu(*a, *b); + return 0; + } + """ + + _ZL6__hgtu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgtu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgtu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgtu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hisnan(): + pass + + +def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisnan13__nv_bfloat16_nbst(bool &retval , __nv_bfloat16* a) { + retval = __hisnan(*a); + return 0; + } + """ + + _ZL8__hisnan13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisnan13__nv_bfloat16_nbst", bool_(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) + + @lower(__hisnan, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisnan13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisnan13__nv_bfloat16_nbst_caller, + signature(bool_, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hmax(): + pass + + +def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmax13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax(*a, *b); + return 0; + } + """ + + _ZL6__hmax13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmax13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + 
CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmax13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmax13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin(): + pass + + +def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmin13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin(*a, *b); + return 0; + } + """ + + _ZL6__hmin13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmin13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmin13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__hmin13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmax_nan(): + pass + + +def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmax_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmax_nan13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin_nan(): + pass + + +def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmin_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmin_nan13__nv_bfloat16S__nbst = declare_device( + 
"_ZL10__hmin_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmin_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma_relu(): + pass + + +def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_relu(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst = declare_device( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_relu, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", 
shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hmax2(): + pass + + +def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmax214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2(*a, *b); + return 0; + } + """ + + _ZL7__hmax214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmax214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmax214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmax214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2(): + pass + + +def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + 
extern "C" __device__ int + _ZL7__hmin214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2(*a, *b); + return 0; + } + """ + + _ZL7__hmin214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmin214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmin214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmin214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmax2_nan(): + pass + + +def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmax2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmax2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2_nan(): + pass + + +def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmin2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmin2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + 
+_lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2_relu(): + pass + + +def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_relu(*a, *b, *c); + return 0; + } + """ + + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst = declare_device( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_relu, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hcmadd(): + pass + + +def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hcmadd14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hcmadd(*a, *b, *c); + return 0; + } + """ + + 
_ZL8__hcmadd14__nv_bfloat162S_S__nbst = declare_device( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hcmadd, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def hsqrt(): + pass + + +def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsqrt(*a); + return 0; + } + """ + + _ZL5hsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL5hsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrsqrt(): + pass + + +def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hrsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrsqrt(*a); + return 0; + } + """ + + _ZL6hrsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL6hrsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hrsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hrsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hrsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrcp(): + pass + + +def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hrcp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrcp(*a); + return 0; + } + """ + + _ZL4hrcp13__nv_bfloat16_nbst = declare_device( + "_ZL4hrcp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) + + @lower(hrcp, 
_type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hrcp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog(): + pass + + +def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hlog13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog(*a); + return 0; + } + """ + + _ZL4hlog13__nv_bfloat16_nbst = declare_device( + "_ZL4hlog13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hlog13__nv_bfloat16_nbst(arg_0) + + @lower(hlog, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hlog13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog2(): + pass + + +def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hlog213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog2(*a); + return 0; + } + """ 
+ + _ZL5hlog213__nv_bfloat16_nbst = declare_device( + "_ZL5hlog213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hlog213__nv_bfloat16_nbst(arg_0) + + @lower(hlog2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hlog213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hlog213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog10(): + pass + + +def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hlog1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog10(*a); + return 0; + } + """ + + _ZL6hlog1013__nv_bfloat16_nbst = declare_device( + "_ZL6hlog1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) + + @lower(hlog10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hlog1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hlog1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + 
+_lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp(): + pass + + +def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hexp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp(*a); + return 0; + } + """ + + _ZL4hexp13__nv_bfloat16_nbst = declare_device( + "_ZL4hexp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hexp13__nv_bfloat16_nbst(arg_0) + + @lower(hexp, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hexp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def htanh_approx(): + pass + + +def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12htanh_approx13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh_approx(*a); + return 0; + } + """ + + _ZL12htanh_approx13__nv_bfloat16_nbst = declare_device( + "_ZL12htanh_approx13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): + return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) + + @lower(htanh_approx, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12htanh_approx13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs 
= [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12htanh_approx13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh_approx(): + pass + + +def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL13h2tanh_approx14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh_approx(*a); + return 0; + } + """ + + _ZL13h2tanh_approx14__nv_bfloat162_nbst = declare_device( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): + return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh_approx, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def htanh(): + pass + + +def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5htanh13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh(*a); + return 0; + } + """ + + _ZL5htanh13__nv_bfloat16_nbst = 
declare_device( + "_ZL5htanh13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5htanh13__nv_bfloat16_nbst(arg_0) + + @lower(htanh, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5htanh13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5htanh13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh(): + pass + + +def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2tanh14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh(*a); + return 0; + } + """ + + _ZL6h2tanh14__nv_bfloat162_nbst = declare_device( + "_ZL6h2tanh14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2tanh14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2tanh14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + 
+_lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def hexp2(): + pass + + +def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hexp213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp2(*a); + return 0; + } + """ + + _ZL5hexp213__nv_bfloat16_nbst = declare_device( + "_ZL5hexp213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hexp213__nv_bfloat16_nbst(arg_0) + + @lower(hexp2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hexp213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hexp213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp10(): + pass + + +def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hexp1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp10(*a); + return 0; + } + """ + + _ZL6hexp1013__nv_bfloat16_nbst = declare_device( + "_ZL6hexp1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) + + @lower(hexp10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hexp1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) 
for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hexp1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hcos(): + pass + + +def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hcos13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hcos(*a); + return 0; + } + """ + + _ZL4hcos13__nv_bfloat16_nbst = declare_device( + "_ZL4hcos13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hcos13__nv_bfloat16_nbst(arg_0) + + @lower(hcos, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hcos13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hsin(): + pass + + +def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hsin13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsin(*a); + return 0; + } + """ + + _ZL4hsin13__nv_bfloat16_nbst = declare_device( + "_ZL4hsin13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hsin13__nv_bfloat16_nbst(arg_0) + + 
@lower(hsin, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hsin13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2sqrt(): + pass + + +def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2sqrt14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sqrt(*a); + return 0; + } + """ + + _ZL6h2sqrt14__nv_bfloat162_nbst = declare_device( + "_ZL6h2sqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2sqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2sqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2sqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rsqrt(): + pass + + +def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2rsqrt14__nv_bfloat162_nbst(__nv_bfloat162 
&retval , __nv_bfloat162* a) { + retval = h2rsqrt(*a); + return 0; + } + """ + + _ZL7h2rsqrt14__nv_bfloat162_nbst = declare_device( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2rsqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2rsqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rcp(): + pass + + +def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2rcp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2rcp(*a); + return 0; + } + """ + + _ZL5h2rcp14__nv_bfloat162_nbst = declare_device( + "_ZL5h2rcp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) + + @lower(h2rcp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2rcp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2rcp14__nv_bfloat162_nbst_caller, 
+ signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log(): + pass + + +def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2log14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log(*a); + return 0; + } + """ + + _ZL5h2log14__nv_bfloat162_nbst = declare_device( + "_ZL5h2log14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2log14__nv_bfloat162_nbst(arg_0) + + @lower(h2log, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2log14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2log14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log2(): + pass + + +def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2log214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log2(*a); + return 0; + } + """ + + _ZL6h2log214__nv_bfloat162_nbst = declare_device( + "_ZL6h2log214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2log214__nv_bfloat162_nbst(arg_0) + + @lower(h2log2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( 
+ "_ZL6h2log214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2log214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log10(): + pass + + +def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2log1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log10(*a); + return 0; + } + """ + + _ZL7h2log1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2log1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) + + @lower(h2log10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2log1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2log1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp(): + pass + + +def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2exp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp(*a); + return 0; + } + """ + + _ZL5h2exp14__nv_bfloat162_nbst = declare_device( + 
"_ZL5h2exp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) + + @lower(h2exp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2exp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2exp14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp2(): + pass + + +def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2exp214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp2(*a); + return 0; + } + """ + + _ZL6h2exp214__nv_bfloat162_nbst = declare_device( + "_ZL6h2exp214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) + + @lower(h2exp2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2exp214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2exp214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, 
shim_obj) + + +def h2exp10(): + pass + + +def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2exp1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp10(*a); + return 0; + } + """ + + _ZL7h2exp1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2exp1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) + + @lower(h2exp10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2exp1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2exp1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2cos(): + pass + + +def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2cos14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2cos(*a); + return 0; + } + """ + + _ZL5h2cos14__nv_bfloat162_nbst = declare_device( + "_ZL5h2cos14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) + + @lower(h2cos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2cos14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2cos14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2sin(): + pass + + +def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2sin14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sin(*a); + return 0; + } + """ + + _ZL5h2sin14__nv_bfloat162_nbst = declare_device( + "_ZL5h2sin14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) + + @lower(h2sin, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2sin14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2sin14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def atomicAdd(): + pass + + +def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP14__nv_bfloat162S__nbst = declare_device( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + 
CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP13__nv_bfloat16S__nbst = declare_device( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK13__nv_bfloat16S1__nbst = declare_device( + "_ZplRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmiRK13__nv_bfloat16S1__nbst", + 
_type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmlRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK13__nv_bfloat16S1__nbst_caller, + 
signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK13__nv_bfloat16S1__nbst = declare_device( + "_ZdvRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR13__nv_bfloat16RKS__nbst = declare_device( + "_ZpLR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return 
_ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR13__nv_bfloat16RKS__nbst = declare_device( + "_ZmIR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + 
_lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)


def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):
    """Register the lowering of ``operator.imul`` on two ``__nv_bfloat16``.

    Declares the ``extern "C"`` shim ``_ZmLR13__nv_bfloat16RKS__nbst`` (a thin
    wrapper around the C++ ``operator*=``) as a device function and installs
    an ``@lower`` implementation that calls it with pointers to the operands.

    Parameters
    ----------
    shim_stream
        Object whose ``write_with_key(key, source)`` is handed the shim
        source from inside the registered implementation.
    shim_obj
        Item passed to ``add_linking_file`` on the active code library from
        inside the registered implementation.
    """
    symbol = "_ZmLR13__nv_bfloat16RKS__nbst"
    shim_raw_str = """
    extern "C" __device__ int
    _ZmLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
        retval = operator*=(*lh, *rh);
        return 0;
    }
    """

    bf16_ptr = CPointer(_type___nv_bfloat16)
    caller_sig = signature(_type___nv_bfloat16, bf16_ptr, bf16_ptr)

    _ZmLR13__nv_bfloat16RKS__nbst = declare_device(
        symbol,
        _type___nv_bfloat16(bf16_ptr, bf16_ptr),
    )

    # Plain forwarding function; it is what compile_internal compiles.
    def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):
        return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1)

    @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16)
    def impl(context, builder, sig, args):
        context.active_code_library.add_linking_file(shim_obj)
        shim_stream.write_with_key(symbol, shim_raw_str)

        # The shim takes its operands by pointer: reserve one stack slot per
        # argument, then spill each incoming value into its slot.
        slots = [
            builder.alloca(context.get_value_type(arg_ty))
            for arg_ty in sig.args
        ]
        for slot, arg_ty, value in zip(slots, sig.args, args):
            builder.store(value, slot, align=getattr(arg_ty, "alignof_", None))

        return context.compile_internal(
            builder,
            _ZmLR13__nv_bfloat16RKS__nbst_caller,
            caller_sig,
            slots,
        )


_lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)


def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj):
    """Register the lowering of ``operator.itruediv`` on two ``__nv_bfloat16``.

    Declares the ``extern "C"`` shim ``_ZdVR13__nv_bfloat16RKS__nbst`` (a thin
    wrapper around the C++ ``operator/=``) as a device function and installs
    an ``@lower`` implementation that calls it with pointers to the operands.

    Parameters
    ----------
    shim_stream
        Object whose ``write_with_key(key, source)`` is handed the shim
        source from inside the registered implementation.
    shim_obj
        Item passed to ``add_linking_file`` on the active code library from
        inside the registered implementation.
    """
    symbol = "_ZdVR13__nv_bfloat16RKS__nbst"
    shim_raw_str = """
    extern "C" __device__ int
    _ZdVR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
        retval = operator/=(*lh, *rh);
        return 0;
    }
    """

    bf16_ptr = CPointer(_type___nv_bfloat16)
    caller_sig = signature(_type___nv_bfloat16, bf16_ptr, bf16_ptr)

    _ZdVR13__nv_bfloat16RKS__nbst = declare_device(
        symbol,
        _type___nv_bfloat16(bf16_ptr, bf16_ptr),
    )

    # Plain forwarding function; it is what compile_internal compiles.
    def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1):
        return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1)

    @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16)
    def impl(context, builder, sig, args):
        context.active_code_library.add_linking_file(shim_obj)
        shim_stream.write_with_key(symbol, shim_raw_str)

        # The shim takes its operands by pointer: reserve one stack slot per
        # argument, then spill each incoming value into its slot.
        slots = [
            builder.alloca(context.get_value_type(arg_ty))
            for arg_ty in sig.args
        ]
        for slot, arg_ty, value in zip(slots, sig.args, args):
            builder.store(value, slot, align=getattr(arg_ty, "alignof_", None))

        return context.compile_internal(
            builder,
            _ZdVR13__nv_bfloat16RKS__nbst_caller,
            caller_sig,
            slots,
        )


_lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj)


def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj):
    """Register the lowering of ``operator.pos`` on a ``__nv_bfloat16``.

    Declares the ``extern "C"`` shim ``_ZpsRK13__nv_bfloat16_nbst`` (a thin
    wrapper around the unary C++ ``operator+``) as a device function and
    installs an ``@lower`` implementation that calls it with a pointer to the
    operand.

    Parameters
    ----------
    shim_stream
        Object whose ``write_with_key(key, source)`` is handed the shim
        source from inside the registered implementation.
    shim_obj
        Item passed to ``add_linking_file`` on the active code library from
        inside the registered implementation.
    """
    symbol = "_ZpsRK13__nv_bfloat16_nbst"
    shim_raw_str = """
    extern "C" __device__ int
    _ZpsRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
        retval = operator+(*h);
        return 0;
    }
    """

    bf16_ptr = CPointer(_type___nv_bfloat16)
    caller_sig = signature(_type___nv_bfloat16, bf16_ptr)

    _ZpsRK13__nv_bfloat16_nbst = declare_device(
        symbol,
        _type___nv_bfloat16(bf16_ptr),
    )

    # Plain forwarding function; it is what compile_internal compiles.
    def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0):
        return _ZpsRK13__nv_bfloat16_nbst(arg_0)

    @lower(operator.pos, _type___nv_bfloat16)
    def impl(context, builder, sig, args):
        context.active_code_library.add_linking_file(shim_obj)
        shim_stream.write_with_key(symbol, shim_raw_str)

        # The shim takes its operand by pointer: reserve one stack slot per
        # argument, then spill each incoming value into its slot.
        slots = [
            builder.alloca(context.get_value_type(arg_ty))
            for arg_ty in sig.args
        ]
        for slot, arg_ty, value in zip(slots, sig.args, args):
            builder.store(value, slot, align=getattr(arg_ty, "alignof_", None))

        return context.compile_internal(
            builder,
            _ZpsRK13__nv_bfloat16_nbst_caller,
            caller_sig,
            slots,
        )


_lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj)


def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj):
    """Register the lowering of ``operator.neg`` on a ``__nv_bfloat16``.

    Declares the ``extern "C"`` shim ``_ZngRK13__nv_bfloat16_nbst`` (a thin
    wrapper around the unary C++ ``operator-``) as a device function and
    installs an ``@lower`` implementation that calls it with a pointer to the
    operand.

    Parameters
    ----------
    shim_stream
        Object whose ``write_with_key(key, source)`` is handed the shim
        source from inside the registered implementation.
    shim_obj
        Item passed to ``add_linking_file`` on the active code library from
        inside the registered implementation.
    """
    symbol = "_ZngRK13__nv_bfloat16_nbst"
    shim_raw_str = """
    extern "C" __device__ int
    _ZngRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) {
        retval = operator-(*h);
        return 0;
    }
    """

    bf16_ptr = CPointer(_type___nv_bfloat16)
    caller_sig = signature(_type___nv_bfloat16, bf16_ptr)

    _ZngRK13__nv_bfloat16_nbst = declare_device(
        symbol,
        _type___nv_bfloat16(bf16_ptr),
    )

    # Plain forwarding function; it is what compile_internal compiles.
    def _ZngRK13__nv_bfloat16_nbst_caller(arg_0):
        return _ZngRK13__nv_bfloat16_nbst(arg_0)

    @lower(operator.neg, _type___nv_bfloat16)
    def impl(context, builder, sig, args):
        context.active_code_library.add_linking_file(shim_obj)
        shim_stream.write_with_key(symbol, shim_raw_str)

        # The shim takes its operand by pointer: reserve one stack slot per
        # argument, then spill each incoming value into its slot.
        slots = [
            builder.alloca(context.get_value_type(arg_ty))
            for arg_ty in sig.args
        ]
        for slot, arg_ty, value in zip(slots, sig.args, args):
            builder.store(value, slot, align=getattr(arg_ty, "alignof_", None))

        return context.compile_internal(
            builder,
            _ZngRK13__nv_bfloat16_nbst_caller,
            caller_sig,
            slots,
        )


_lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj)


def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj):
    """Register the lowering of ``operator.eq`` on two ``__nv_bfloat16``.

    Declares the ``extern "C"`` shim ``_ZeqRK13__nv_bfloat16S1__nbst`` (a thin
    wrapper around the C++ ``operator==`` that yields a ``bool``) as a device
    function and installs an ``@lower`` implementation that calls it with
    pointers to the operands.

    Parameters
    ----------
    shim_stream
        Object whose ``write_with_key(key, source)`` is handed the shim
        source from inside the registered implementation.
    shim_obj
        Item passed to ``add_linking_file`` on the active code library from
        inside the registered implementation.
    """
    symbol = "_ZeqRK13__nv_bfloat16S1__nbst"
    shim_raw_str = """
    extern "C" __device__ int
    _ZeqRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) {
        retval = operator==(*lh, *rh);
        return 0;
    }
    """

    bf16_ptr = CPointer(_type___nv_bfloat16)
    caller_sig = signature(bool_, bf16_ptr, bf16_ptr)

    _ZeqRK13__nv_bfloat16S1__nbst = declare_device(
        symbol,
        bool_(bf16_ptr, bf16_ptr),
    )

    # Plain forwarding function; it is what compile_internal compiles.
    def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1):
        return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1)

    @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16)
    def impl(context, builder, sig, args):
        context.active_code_library.add_linking_file(shim_obj)
        shim_stream.write_with_key(symbol, shim_raw_str)

        # The shim takes its operands by pointer: reserve one stack slot per
        # argument, then spill each incoming value into its slot.
        slots = [
            builder.alloca(context.get_value_type(arg_ty))
            for arg_ty in sig.args
        ]
        for slot, arg_ty, value in zip(slots, sig.args, args):
            builder.store(value, slot, align=getattr(arg_ty, "alignof_", None))

        return context.compile_internal(
            builder,
            _ZeqRK13__nv_bfloat16S1__nbst_caller,
            caller_sig,
            slots,
        )
+_lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK13__nv_bfloat16S1__nbst = declare_device( + "_ZneRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgtRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZgtRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK13__nv_bfloat16S1__nbst = declare_device( + "_ZltRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>=(*lh, *rh); + return 
0; + } + """ + + _ZgeRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgeRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK13__nv_bfloat16S1__nbst = declare_device( + "_ZleRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZleRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK14__nv_bfloat162S1__nbst = declare_device( + "_ZplRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmiRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def 
_ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmlRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK14__nv_bfloat162S1__nbst = declare_device( + "_ZdvRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZpLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + 
@lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmIR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, 
shim_obj) + + +def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*=(*lh, *rh); + return 0; + } + """ + + _ZmLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdVR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/=(*lh, *rh); + return 0; + } + """ + + _ZdVR14__nv_bfloat162RKS__nbst = declare_device( + "_ZdVR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdVR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdVR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpsRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator+(*h); + return 0; + } + """ + + _ZpsRK14__nv_bfloat162_nbst = declare_device( + "_ZpsRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZpsRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.pos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpsRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZngRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator-(*h); + return 0; + } + """ + + _ZngRK14__nv_bfloat162_nbst = 
declare_device( + "_ZngRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZngRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.neg, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZngRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZeqRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator==(*lh, *rh); + return 0; + } + """ + + _ZeqRK14__nv_bfloat162S1__nbst = declare_device( + "_ZeqRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZeqRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZeqRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), 
+ ), + ptrs, + ) + + +_lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK14__nv_bfloat162S1__nbst = declare_device( + "_ZneRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgtRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgtRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK14__nv_bfloat162S1__nbst = declare_device( + "_ZltRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK14__nv_bfloat162S1__nbst(bool &retval , 
__nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>=(*lh, *rh); + return 0; + } + """ + + _ZgeRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgeRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK14__nv_bfloat162S1__nbst = declare_device( + "_ZleRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZleRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def __half(): + pass + + +def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZN6__halfC1E13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* f) { + __half(*f); + return 0; + } + """ + + _ZN6__halfC1E13__nv_bfloat16_nbst = declare_device( + "_ZN6__halfC1E13__nv_bfloat16_nbst", void(CPointer(_type___nv_bfloat16)) + ) + + def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): + return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) + + @lower(__half, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZN6__halfC1E13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZN6__halfC1E13__nv_bfloat16_nbst_caller, + signature(void, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +@register +class _typing___double2bfloat16(ConcreteTemplate): + key = globals()["__double2bfloat16"] + cases = [signature(_type___nv_bfloat16, float64)] + + +register_global(__double2bfloat16, types.Function(_typing___double2bfloat16)) + + +@register +class _typing___float2bfloat16(ConcreteTemplate): + key = globals()["__float2bfloat16"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global(__float2bfloat16, types.Function(_typing___float2bfloat16)) + + +@register +class 
_typing___float2bfloat16_rn(ConcreteTemplate): + key = globals()["__float2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rn, types.Function(_typing___float2bfloat16_rn) +) + + +@register +class _typing___float2bfloat16_rz(ConcreteTemplate): + key = globals()["__float2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rz, types.Function(_typing___float2bfloat16_rz) +) + + +@register +class _typing___float2bfloat16_rd(ConcreteTemplate): + key = globals()["__float2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rd, types.Function(_typing___float2bfloat16_rd) +) + + +@register +class _typing___float2bfloat16_ru(ConcreteTemplate): + key = globals()["__float2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_ru, types.Function(_typing___float2bfloat16_ru) +) + + +@register +class _typing___bfloat162float(ConcreteTemplate): + key = globals()["__bfloat162float"] + cases = [signature(float32, _type___nv_bfloat16)] + + +register_global(__bfloat162float, types.Function(_typing___bfloat162float)) + + +@register +class _typing___float2bfloat162_rn(ConcreteTemplate): + key = globals()["__float2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32)] + + +register_global( + __float2bfloat162_rn, types.Function(_typing___float2bfloat162_rn) +) + + +@register +class _typing___floats2bfloat162_rn(ConcreteTemplate): + key = globals()["__floats2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32, float32)] + + +register_global( + __floats2bfloat162_rn, types.Function(_typing___floats2bfloat162_rn) +) + + +@register +class _typing___low2float(ConcreteTemplate): + key = globals()["__low2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__low2float, types.Function(_typing___low2float)) + + +@register 
+class _typing___high2float(ConcreteTemplate): + key = globals()["__high2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__high2float, types.Function(_typing___high2float)) + + +@register +class _typing___float22bfloat162_rn(ConcreteTemplate): + key = globals()["__float22bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32x2)] + + +register_global( + __float22bfloat162_rn, types.Function(_typing___float22bfloat162_rn) +) + + +@register +class _typing___bfloat1622float2(ConcreteTemplate): + key = globals()["__bfloat1622float2"] + cases = [signature(float32x2, _type___nv_bfloat162)] + + +register_global(__bfloat1622float2, types.Function(_typing___bfloat1622float2)) + + +@register +class _typing___bfloat162char_rz(ConcreteTemplate): + key = globals()["__bfloat162char_rz"] + cases = [signature(int8, _type___nv_bfloat16)] + + +register_global(__bfloat162char_rz, types.Function(_typing___bfloat162char_rz)) + + +@register +class _typing___bfloat162uchar_rz(ConcreteTemplate): + key = globals()["__bfloat162uchar_rz"] + cases = [signature(uint8, _type___nv_bfloat16)] + + +register_global( + __bfloat162uchar_rz, types.Function(_typing___bfloat162uchar_rz) +) + + +@register +class _typing___bfloat162int_rn(ConcreteTemplate): + key = globals()["__bfloat162int_rn"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rn, types.Function(_typing___bfloat162int_rn)) + + +@register +class _typing___bfloat162int_rz(ConcreteTemplate): + key = globals()["__bfloat162int_rz"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rz, types.Function(_typing___bfloat162int_rz)) + + +@register +class _typing___bfloat162int_rd(ConcreteTemplate): + key = globals()["__bfloat162int_rd"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rd, types.Function(_typing___bfloat162int_rd)) + + +@register +class _typing___bfloat162int_ru(ConcreteTemplate): + 
key = globals()["__bfloat162int_ru"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_ru, types.Function(_typing___bfloat162int_ru)) + + +@register +class _typing___int2bfloat16_rn(ConcreteTemplate): + key = globals()["__int2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rn, types.Function(_typing___int2bfloat16_rn)) + + +@register +class _typing___int2bfloat16_rz(ConcreteTemplate): + key = globals()["__int2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rz, types.Function(_typing___int2bfloat16_rz)) + + +@register +class _typing___int2bfloat16_rd(ConcreteTemplate): + key = globals()["__int2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rd, types.Function(_typing___int2bfloat16_rd)) + + +@register +class _typing___int2bfloat16_ru(ConcreteTemplate): + key = globals()["__int2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_ru, types.Function(_typing___int2bfloat16_ru)) + + +@register +class _typing___bfloat162short_rn(ConcreteTemplate): + key = globals()["__bfloat162short_rn"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rn, types.Function(_typing___bfloat162short_rn) +) + + +@register +class _typing___bfloat162short_rz(ConcreteTemplate): + key = globals()["__bfloat162short_rz"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rz, types.Function(_typing___bfloat162short_rz) +) + + +@register +class _typing___bfloat162short_rd(ConcreteTemplate): + key = globals()["__bfloat162short_rd"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rd, types.Function(_typing___bfloat162short_rd) +) + + +@register +class _typing___bfloat162short_ru(ConcreteTemplate): + key = globals()["__bfloat162short_ru"] + cases 
= [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_ru, types.Function(_typing___bfloat162short_ru) +) + + +@register +class _typing___short2bfloat16_rn(ConcreteTemplate): + key = globals()["__short2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rn, types.Function(_typing___short2bfloat16_rn) +) + + +@register +class _typing___short2bfloat16_rz(ConcreteTemplate): + key = globals()["__short2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rz, types.Function(_typing___short2bfloat16_rz) +) + + +@register +class _typing___short2bfloat16_rd(ConcreteTemplate): + key = globals()["__short2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rd, types.Function(_typing___short2bfloat16_rd) +) + + +@register +class _typing___short2bfloat16_ru(ConcreteTemplate): + key = globals()["__short2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_ru, types.Function(_typing___short2bfloat16_ru) +) + + +@register +class _typing___bfloat162uint_rn(ConcreteTemplate): + key = globals()["__bfloat162uint_rn"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rn, types.Function(_typing___bfloat162uint_rn)) + + +@register +class _typing___bfloat162uint_rz(ConcreteTemplate): + key = globals()["__bfloat162uint_rz"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rz, types.Function(_typing___bfloat162uint_rz)) + + +@register +class _typing___bfloat162uint_rd(ConcreteTemplate): + key = globals()["__bfloat162uint_rd"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rd, types.Function(_typing___bfloat162uint_rd)) + + +@register +class _typing___bfloat162uint_ru(ConcreteTemplate): + key = globals()["__bfloat162uint_ru"] + cases = 
[signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_ru, types.Function(_typing___bfloat162uint_ru)) + + +@register +class _typing___uint2bfloat16_rn(ConcreteTemplate): + key = globals()["__uint2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rn, types.Function(_typing___uint2bfloat16_rn)) + + +@register +class _typing___uint2bfloat16_rz(ConcreteTemplate): + key = globals()["__uint2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rz, types.Function(_typing___uint2bfloat16_rz)) + + +@register +class _typing___uint2bfloat16_rd(ConcreteTemplate): + key = globals()["__uint2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rd, types.Function(_typing___uint2bfloat16_rd)) + + +@register +class _typing___uint2bfloat16_ru(ConcreteTemplate): + key = globals()["__uint2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_ru, types.Function(_typing___uint2bfloat16_ru)) + + +@register +class _typing___bfloat162ushort_rn(ConcreteTemplate): + key = globals()["__bfloat162ushort_rn"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rn, types.Function(_typing___bfloat162ushort_rn) +) + + +@register +class _typing___bfloat162ushort_rz(ConcreteTemplate): + key = globals()["__bfloat162ushort_rz"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rz, types.Function(_typing___bfloat162ushort_rz) +) + + +@register +class _typing___bfloat162ushort_rd(ConcreteTemplate): + key = globals()["__bfloat162ushort_rd"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rd, types.Function(_typing___bfloat162ushort_rd) +) + + +@register +class _typing___bfloat162ushort_ru(ConcreteTemplate): + key = globals()["__bfloat162ushort_ru"] + cases = 
[signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_ru, types.Function(_typing___bfloat162ushort_ru) +) + + +@register +class _typing___ushort2bfloat16_rn(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rn, types.Function(_typing___ushort2bfloat16_rn) +) + + +@register +class _typing___ushort2bfloat16_rz(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rz, types.Function(_typing___ushort2bfloat16_rz) +) + + +@register +class _typing___ushort2bfloat16_rd(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rd, types.Function(_typing___ushort2bfloat16_rd) +) + + +@register +class _typing___ushort2bfloat16_ru(ConcreteTemplate): + key = globals()["__ushort2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_ru, types.Function(_typing___ushort2bfloat16_ru) +) + + +@register +class _typing___bfloat162ull_rn(ConcreteTemplate): + key = globals()["__bfloat162ull_rn"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rn, types.Function(_typing___bfloat162ull_rn)) + + +@register +class _typing___bfloat162ull_rz(ConcreteTemplate): + key = globals()["__bfloat162ull_rz"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rz, types.Function(_typing___bfloat162ull_rz)) + + +@register +class _typing_make_bfloat162(ConcreteTemplate): + key = globals()["make_bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) + + +@register +class _typing___bfloat162ull_rd(ConcreteTemplate): + key = 
globals()["__bfloat162ull_rd"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rd, types.Function(_typing___bfloat162ull_rd)) + + +@register +class _typing___bfloat162ull_ru(ConcreteTemplate): + key = globals()["__bfloat162ull_ru"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_ru, types.Function(_typing___bfloat162ull_ru)) + + +@register +class _typing___ull2bfloat16_rn(ConcreteTemplate): + key = globals()["__ull2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rn, types.Function(_typing___ull2bfloat16_rn)) + + +@register +class _typing___ull2bfloat16_rz(ConcreteTemplate): + key = globals()["__ull2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rz, types.Function(_typing___ull2bfloat16_rz)) + + +@register +class _typing___ull2bfloat16_rd(ConcreteTemplate): + key = globals()["__ull2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rd, types.Function(_typing___ull2bfloat16_rd)) + + +@register +class _typing___ull2bfloat16_ru(ConcreteTemplate): + key = globals()["__ull2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_ru, types.Function(_typing___ull2bfloat16_ru)) + + +@register +class _typing___bfloat162ll_rn(ConcreteTemplate): + key = globals()["__bfloat162ll_rn"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rn, types.Function(_typing___bfloat162ll_rn)) + + +@register +class _typing___bfloat162ll_rz(ConcreteTemplate): + key = globals()["__bfloat162ll_rz"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rz, types.Function(_typing___bfloat162ll_rz)) + + +@register +class _typing___bfloat162ll_rd(ConcreteTemplate): + key = globals()["__bfloat162ll_rd"] + cases = [signature(int64, _type___nv_bfloat16)] + + 
+register_global(__bfloat162ll_rd, types.Function(_typing___bfloat162ll_rd)) + + +@register +class _typing___bfloat162ll_ru(ConcreteTemplate): + key = globals()["__bfloat162ll_ru"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_ru, types.Function(_typing___bfloat162ll_ru)) + + +@register +class _typing___ll2bfloat16_rn(ConcreteTemplate): + key = globals()["__ll2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rn, types.Function(_typing___ll2bfloat16_rn)) + + +@register +class _typing___ll2bfloat16_rz(ConcreteTemplate): + key = globals()["__ll2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rz, types.Function(_typing___ll2bfloat16_rz)) + + +@register +class _typing___ll2bfloat16_rd(ConcreteTemplate): + key = globals()["__ll2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rd, types.Function(_typing___ll2bfloat16_rd)) + + +@register +class _typing___ll2bfloat16_ru(ConcreteTemplate): + key = globals()["__ll2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_ru, types.Function(_typing___ll2bfloat16_ru)) + + +@register +class _typing_htrunc(ConcreteTemplate): + key = globals()["htrunc"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(htrunc, types.Function(_typing_htrunc)) + + +@register +class _typing_hceil(ConcreteTemplate): + key = globals()["hceil"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hceil, types.Function(_typing_hceil)) + + +@register +class _typing_hfloor(ConcreteTemplate): + key = globals()["hfloor"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hfloor, types.Function(_typing_hfloor)) + + +@register +class _typing_hrint(ConcreteTemplate): + key = globals()["hrint"] + cases = [signature(_type___nv_bfloat16, 
_type___nv_bfloat16)] + + +register_global(hrint, types.Function(_typing_hrint)) + + +@register +class _typing_h2trunc(ConcreteTemplate): + key = globals()["h2trunc"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2trunc, types.Function(_typing_h2trunc)) + + +@register +class _typing_h2ceil(ConcreteTemplate): + key = globals()["h2ceil"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2ceil, types.Function(_typing_h2ceil)) + + +@register +class _typing_h2floor(ConcreteTemplate): + key = globals()["h2floor"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2floor, types.Function(_typing_h2floor)) + + +@register +class _typing_h2rint(ConcreteTemplate): + key = globals()["h2rint"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2rint, types.Function(_typing_h2rint)) + + +@register +class _typing___bfloat162bfloat162(ConcreteTemplate): + key = globals()["__bfloat162bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat16)] + + +register_global( + __bfloat162bfloat162, types.Function(_typing___bfloat162bfloat162) +) + + +@register +class _typing___lowhigh2highlow(ConcreteTemplate): + key = globals()["__lowhigh2highlow"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__lowhigh2highlow, types.Function(_typing___lowhigh2highlow)) + + +@register +class _typing___lows2bfloat162(ConcreteTemplate): + key = globals()["__lows2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__lows2bfloat162, types.Function(_typing___lows2bfloat162)) + + +@register +class _typing___highs2bfloat162(ConcreteTemplate): + key = globals()["__highs2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__highs2bfloat162, 
types.Function(_typing___highs2bfloat162)) + + +@register +class _typing___high2bfloat16(ConcreteTemplate): + key = globals()["__high2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__high2bfloat16, types.Function(_typing___high2bfloat16)) + + +@register +class _typing___low2bfloat16(ConcreteTemplate): + key = globals()["__low2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__low2bfloat16, types.Function(_typing___low2bfloat16)) + + +@register +class _typing___hisinf(ConcreteTemplate): + key = globals()["__hisinf"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__hisinf, types.Function(_typing___hisinf)) + + +@register +class _typing___halves2bfloat162(ConcreteTemplate): + key = globals()["__halves2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(__halves2bfloat162, types.Function(_typing___halves2bfloat162)) + + +@register +class _typing___low2bfloat162(ConcreteTemplate): + key = globals()["__low2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__low2bfloat162, types.Function(_typing___low2bfloat162)) + + +@register +class _typing___high2bfloat162(ConcreteTemplate): + key = globals()["__high2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__high2bfloat162, types.Function(_typing___high2bfloat162)) + + +@register +class _typing___bfloat16_as_short(ConcreteTemplate): + key = globals()["__bfloat16_as_short"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_short, types.Function(_typing___bfloat16_as_short) +) + + +@register +class _typing___bfloat16_as_ushort(ConcreteTemplate): + key = globals()["__bfloat16_as_ushort"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_ushort, 
types.Function(_typing___bfloat16_as_ushort) +) + + +@register +class _typing___short_as_bfloat16(ConcreteTemplate): + key = globals()["__short_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short_as_bfloat16, types.Function(_typing___short_as_bfloat16) +) + + +@register +class _typing___ushort_as_bfloat16(ConcreteTemplate): + key = globals()["__ushort_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort_as_bfloat16, types.Function(_typing___ushort_as_bfloat16) +) + + +@register +class _typing___shfl_sync(ConcreteTemplate): + key = globals()["__shfl_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_sync, types.Function(_typing___shfl_sync)) + + +@register +class _typing___shfl_up_sync(ConcreteTemplate): + key = globals()["__shfl_up_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_up_sync, types.Function(_typing___shfl_up_sync)) + + +@register +class _typing___shfl_down_sync(ConcreteTemplate): + key = globals()["__shfl_down_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_down_sync, types.Function(_typing___shfl_down_sync)) + + +@register +class _typing___shfl_xor_sync(ConcreteTemplate): + key = globals()["__shfl_xor_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_xor_sync, 
types.Function(_typing___shfl_xor_sync)) + + +@register +class _typing___ldg(ConcreteTemplate): + key = globals()["__ldg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldg, types.Function(_typing___ldg)) + + +@register +class _typing___ldcg(ConcreteTemplate): + key = globals()["__ldcg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcg, types.Function(_typing___ldcg)) + + +@register +class _typing___ldca(ConcreteTemplate): + key = globals()["__ldca"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldca, types.Function(_typing___ldca)) + + +@register +class _typing___ldcs(ConcreteTemplate): + key = globals()["__ldcs"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcs, types.Function(_typing___ldcs)) + + +@register +class _typing___ldlu(ConcreteTemplate): + key = globals()["__ldlu"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldlu, types.Function(_typing___ldlu)) + + +@register +class _typing___ldcv(ConcreteTemplate): + key = globals()["__ldcv"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcv, types.Function(_typing___ldcv)) + + +@register +class _typing___stwb(ConcreteTemplate): + key = globals()["__stwb"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, 
CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwb, types.Function(_typing___stwb)) + + +@register +class _typing___stcg(ConcreteTemplate): + key = globals()["__stcg"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcg, types.Function(_typing___stcg)) + + +@register +class _typing___stcs(ConcreteTemplate): + key = globals()["__stcs"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcs, types.Function(_typing___stcs)) + + +@register +class _typing___stwt(ConcreteTemplate): + key = globals()["__stwt"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwt, types.Function(_typing___stwt)) + + +@register +class _typing___heq2(ConcreteTemplate): + key = globals()["__heq2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__heq2, types.Function(_typing___heq2)) + + +@register +class _typing___hne2(ConcreteTemplate): + key = globals()["__hne2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hne2, types.Function(_typing___hne2)) + + +@register +class _typing___hle2(ConcreteTemplate): + key = globals()["__hle2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hle2, types.Function(_typing___hle2)) + + +@register +class _typing___hge2(ConcreteTemplate): + key = globals()["__hge2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hge2, 
types.Function(_typing___hge2)) + + +@register +class _typing___hlt2(ConcreteTemplate): + key = globals()["__hlt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hlt2, types.Function(_typing___hlt2)) + + +@register +class _typing___hgt2(ConcreteTemplate): + key = globals()["__hgt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgt2, types.Function(_typing___hgt2)) + + +@register +class _typing___hequ2(ConcreteTemplate): + key = globals()["__hequ2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hequ2, types.Function(_typing___hequ2)) + + +@register +class _typing___hneu2(ConcreteTemplate): + key = globals()["__hneu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hneu2, types.Function(_typing___hneu2)) + + +@register +class _typing___hleu2(ConcreteTemplate): + key = globals()["__hleu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hleu2, types.Function(_typing___hleu2)) + + +@register +class _typing___hgeu2(ConcreteTemplate): + key = globals()["__hgeu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgeu2, types.Function(_typing___hgeu2)) + + +@register +class _typing___hltu2(ConcreteTemplate): + key = globals()["__hltu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hltu2, types.Function(_typing___hltu2)) + + +@register +class _typing___hgtu2(ConcreteTemplate): + key = globals()["__hgtu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgtu2, 
types.Function(_typing___hgtu2)) + + +@register +class _typing___heq2_mask(ConcreteTemplate): + key = globals()["__heq2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__heq2_mask, types.Function(_typing___heq2_mask)) + + +@register +class _typing___hne2_mask(ConcreteTemplate): + key = globals()["__hne2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hne2_mask, types.Function(_typing___hne2_mask)) + + +@register +class _typing___hle2_mask(ConcreteTemplate): + key = globals()["__hle2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hle2_mask, types.Function(_typing___hle2_mask)) + + +@register +class _typing___hge2_mask(ConcreteTemplate): + key = globals()["__hge2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hge2_mask, types.Function(_typing___hge2_mask)) + + +@register +class _typing___hlt2_mask(ConcreteTemplate): + key = globals()["__hlt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hlt2_mask, types.Function(_typing___hlt2_mask)) + + +@register +class _typing___hgt2_mask(ConcreteTemplate): + key = globals()["__hgt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgt2_mask, types.Function(_typing___hgt2_mask)) + + +@register +class _typing___hequ2_mask(ConcreteTemplate): + key = globals()["__hequ2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hequ2_mask, types.Function(_typing___hequ2_mask)) + + +@register +class _typing___hneu2_mask(ConcreteTemplate): + key = globals()["__hneu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneu2_mask, types.Function(_typing___hneu2_mask)) + + +@register +class 
_typing___hleu2_mask(ConcreteTemplate): + key = globals()["__hleu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hleu2_mask, types.Function(_typing___hleu2_mask)) + + +@register +class _typing___hgeu2_mask(ConcreteTemplate): + key = globals()["__hgeu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgeu2_mask, types.Function(_typing___hgeu2_mask)) + + +@register +class _typing___hltu2_mask(ConcreteTemplate): + key = globals()["__hltu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hltu2_mask, types.Function(_typing___hltu2_mask)) + + +@register +class _typing___hgtu2_mask(ConcreteTemplate): + key = globals()["__hgtu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgtu2_mask, types.Function(_typing___hgtu2_mask)) + + +@register +class _typing___hisnan2(ConcreteTemplate): + key = globals()["__hisnan2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hisnan2, types.Function(_typing___hisnan2)) + + +@register +class _typing___hadd2(ConcreteTemplate): + key = globals()["__hadd2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2, types.Function(_typing___hadd2)) + + +@register +class _typing___hsub2(ConcreteTemplate): + key = globals()["__hsub2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2, types.Function(_typing___hsub2)) + + +@register +class _typing___hmul2(ConcreteTemplate): + key = globals()["__hmul2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2, types.Function(_typing___hmul2)) + + +@register +class _typing___hadd2_rn(ConcreteTemplate): + key = 
globals()["__hadd2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_rn, types.Function(_typing___hadd2_rn)) + + +@register +class _typing___hsub2_rn(ConcreteTemplate): + key = globals()["__hsub2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_rn, types.Function(_typing___hsub2_rn)) + + +@register +class _typing___hmul2_rn(ConcreteTemplate): + key = globals()["__hmul2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_rn, types.Function(_typing___hmul2_rn)) + + +@register +class _typing___h2div(ConcreteTemplate): + key = globals()["__h2div"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__h2div, types.Function(_typing___h2div)) + + +@register +class _typing___habs2(ConcreteTemplate): + key = globals()["__habs2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__habs2, types.Function(_typing___habs2)) + + +@register +class _typing___hadd2_sat(ConcreteTemplate): + key = globals()["__hadd2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_sat, types.Function(_typing___hadd2_sat)) + + +@register +class _typing___hsub2_sat(ConcreteTemplate): + key = globals()["__hsub2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_sat, types.Function(_typing___hsub2_sat)) + + +@register +class _typing___hmul2_sat(ConcreteTemplate): + key = globals()["__hmul2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_sat, types.Function(_typing___hmul2_sat)) + + +@register +class 
_typing___hfma2(ConcreteTemplate): + key = globals()["__hfma2"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2, types.Function(_typing___hfma2)) + + +@register +class _typing___hfma2_sat(ConcreteTemplate): + key = globals()["__hfma2_sat"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2_sat, types.Function(_typing___hfma2_sat)) + + +@register +class _typing___hneg2(ConcreteTemplate): + key = globals()["__hneg2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneg2, types.Function(_typing___hneg2)) + + +@register +class _typing___habs(ConcreteTemplate): + key = globals()["__habs"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__habs, types.Function(_typing___habs)) + + +@register +class _typing___hadd(ConcreteTemplate): + key = globals()["__hadd"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd, types.Function(_typing___hadd)) + + +@register +class _typing___hsub(ConcreteTemplate): + key = globals()["__hsub"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub, types.Function(_typing___hsub)) + + +@register +class _typing___hmul(ConcreteTemplate): + key = globals()["__hmul"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul, types.Function(_typing___hmul)) + + +@register +class _typing___hadd_rn(ConcreteTemplate): + key = globals()["__hadd_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_rn, types.Function(_typing___hadd_rn)) + + +@register +class _typing___hsub_rn(ConcreteTemplate): + key = 
globals()["__hsub_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_rn, types.Function(_typing___hsub_rn)) + + +@register +class _typing___hmul_rn(ConcreteTemplate): + key = globals()["__hmul_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_rn, types.Function(_typing___hmul_rn)) + + +@register +class _typing___hdiv(ConcreteTemplate): + key = globals()["__hdiv"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hdiv, types.Function(_typing___hdiv)) + + +@register +class _typing___hadd_sat(ConcreteTemplate): + key = globals()["__hadd_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_sat, types.Function(_typing___hadd_sat)) + + +@register +class _typing___hsub_sat(ConcreteTemplate): + key = globals()["__hsub_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_sat, types.Function(_typing___hsub_sat)) + + +@register +class _typing___hmul_sat(ConcreteTemplate): + key = globals()["__hmul_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_sat, types.Function(_typing___hmul_sat)) + + +@register +class _typing___hfma(ConcreteTemplate): + key = globals()["__hfma"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma, types.Function(_typing___hfma)) + + +@register +class _typing___hfma_sat(ConcreteTemplate): + key = globals()["__hfma_sat"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma_sat, types.Function(_typing___hfma_sat)) + + +@register +class 
_typing___hneg(ConcreteTemplate): + key = globals()["__hneg"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__hneg, types.Function(_typing___hneg)) + + +@register +class _typing___hbeq2(ConcreteTemplate): + key = globals()["__hbeq2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbeq2, types.Function(_typing___hbeq2)) + + +@register +class _typing___hbne2(ConcreteTemplate): + key = globals()["__hbne2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbne2, types.Function(_typing___hbne2)) + + +@register +class _typing___hble2(ConcreteTemplate): + key = globals()["__hble2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hble2, types.Function(_typing___hble2)) + + +@register +class _typing___hbge2(ConcreteTemplate): + key = globals()["__hbge2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbge2, types.Function(_typing___hbge2)) + + +@register +class _typing___hblt2(ConcreteTemplate): + key = globals()["__hblt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hblt2, types.Function(_typing___hblt2)) + + +@register +class _typing___hbgt2(ConcreteTemplate): + key = globals()["__hbgt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgt2, types.Function(_typing___hbgt2)) + + +@register +class _typing___hbequ2(ConcreteTemplate): + key = globals()["__hbequ2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbequ2, types.Function(_typing___hbequ2)) + + +@register +class _typing___hbneu2(ConcreteTemplate): + key = globals()["__hbneu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbneu2, types.Function(_typing___hbneu2)) + + +@register +class 
_typing___hbleu2(ConcreteTemplate): + key = globals()["__hbleu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbleu2, types.Function(_typing___hbleu2)) + + +@register +class _typing___hbgeu2(ConcreteTemplate): + key = globals()["__hbgeu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgeu2, types.Function(_typing___hbgeu2)) + + +@register +class _typing___hbltu2(ConcreteTemplate): + key = globals()["__hbltu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbltu2, types.Function(_typing___hbltu2)) + + +@register +class _typing___hbgtu2(ConcreteTemplate): + key = globals()["__hbgtu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgtu2, types.Function(_typing___hbgtu2)) + + +@register +class _typing___heq(ConcreteTemplate): + key = globals()["__heq"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__heq, types.Function(_typing___heq)) + + +@register +class _typing___hne(ConcreteTemplate): + key = globals()["__hne"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__hne, types.Function(_typing___hne)) + + +@register +class _typing___hle(ConcreteTemplate): + key = globals()["__hle"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__hle, types.Function(_typing___hle)) + + +@register +class _typing___hge(ConcreteTemplate): + key = globals()["__hge"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - def operator_eq_2_caller(arg_0, arg_1): - return operator_eq_2(arg_0, arg_1) - @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_2", shim_raw_str) - ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hge, types.Function(_typing___hge)) - return context.compile_internal( - builder, - operator_eq_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hlt(ConcreteTemplate): + key = globals()["__hlt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_eq_2_lower(shim_stream, shim_obj) +register_global(__hlt, types.Function(_typing___hlt)) -def _operator_ne_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ne_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator!=(*lh, *rh); - return 0; - } - """ - operator_ne_2 = declare_device( - "operator_ne_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) +@register +class _typing___hgt(ConcreteTemplate): + key = globals()["__hgt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - def operator_ne_2_caller(arg_0, arg_1): - return operator_ne_2(arg_0, arg_1) - @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hgt, types.Function(_typing___hgt)) - return context.compile_internal( - builder, - operator_ne_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hequ(ConcreteTemplate): + key = globals()["__hequ"] + cases = [signature(bool_, 
_type___nv_bfloat16, _type___nv_bfloat16)] -_operator_ne_2_lower(shim_stream, shim_obj) +register_global(__hequ, types.Function(_typing___hequ)) -def _operator_gt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_gt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>(*lh, *rh); - return 0; - } - """ - operator_gt_2 = declare_device( - "operator_gt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) +@register +class _typing___hneu(ConcreteTemplate): + key = globals()["__hneu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - def operator_gt_2_caller(arg_0, arg_1): - return operator_gt_2(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hneu, types.Function(_typing___hneu)) - return context.compile_internal( - builder, - operator_gt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hleu(ConcreteTemplate): + key = globals()["__hleu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_gt_2_lower(shim_stream, shim_obj) +register_global(__hleu, types.Function(_typing___hleu)) -def _operator_lt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_lt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator<(*lh, *rh); - return 0; - } - """ - operator_lt_2 = declare_device( - "operator_lt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) +@register 
+class _typing___hgeu(ConcreteTemplate): + key = globals()["__hgeu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - def operator_lt_2_caller(arg_0, arg_1): - return operator_lt_2(arg_0, arg_1) - @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hgeu, types.Function(_typing___hgeu)) - return context.compile_internal( - builder, - operator_lt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hltu(ConcreteTemplate): + key = globals()["__hltu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_lt_2_lower(shim_stream, shim_obj) +register_global(__hltu, types.Function(_typing___hltu)) -def _operator_ge_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ge_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>=(*lh, *rh); - return 0; - } - """ - operator_ge_2 = declare_device( - "operator_ge_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) +@register +class _typing___hgtu(ConcreteTemplate): + key = globals()["__hgtu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - def operator_ge_2_caller(arg_0, arg_1): - return operator_ge_2(arg_0, arg_1) - @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hgtu, types.Function(_typing___hgtu)) - return context.compile_internal( - builder, - operator_ge_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hisnan(ConcreteTemplate): + key = globals()["__hisnan"] + cases = [signature(bool_, _type___nv_bfloat16)] -_operator_ge_2_lower(shim_stream, shim_obj) +register_global(__hisnan, types.Function(_typing___hisnan)) -def _operator_le_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_le_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator<=(*lh, *rh); - return 0; - } - """ - operator_le_2 = declare_device( - "operator_le_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) +@register +class _typing___hmax(ConcreteTemplate): + key = globals()["__hmax"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] - def operator_le_2_caller(arg_0, arg_1): - return operator_le_2(arg_0, arg_1) - @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) +register_global(__hmax, types.Function(_typing___hmax)) - return context.compile_internal( - builder, - operator_le_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) + +@register +class _typing___hmin(ConcreteTemplate): + key = globals()["__hmin"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, 
_type___nv_bfloat16) + ] -_operator_le_2_lower(shim_stream, shim_obj) +register_global(__hmin, types.Function(_typing___hmin)) @register -class _typing_make_bfloat162(ConcreteTemplate): - key = globals()["make_bfloat162"] +class _typing___hmax_nan(ConcreteTemplate): + key = globals()["__hmax_nan"] cases = [ - signature( - _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 - ) + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) ] -register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) +register_global(__hmax_nan, types.Function(_typing___hmax_nan)) @register -class _typing_htrunc(ConcreteTemplate): - key = globals()["htrunc"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmin_nan(ConcreteTemplate): + key = globals()["__hmin_nan"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -register_global(htrunc, types.Function(_typing_htrunc)) +register_global(__hmin_nan, types.Function(_typing___hmin_nan)) @register -class _typing_hceil(ConcreteTemplate): - key = globals()["hceil"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hfma_relu(ConcreteTemplate): + key = globals()["__hfma_relu"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] -register_global(hceil, types.Function(_typing_hceil)) +register_global(__hfma_relu, types.Function(_typing___hfma_relu)) @register -class _typing_hfloor(ConcreteTemplate): - key = globals()["hfloor"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmax2(ConcreteTemplate): + key = globals()["__hmax2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(hfloor, types.Function(_typing_hfloor)) +register_global(__hmax2, types.Function(_typing___hmax2)) @register -class _typing_hrint(ConcreteTemplate): - key = 
globals()["hrint"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmin2(ConcreteTemplate): + key = globals()["__hmin2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(hrint, types.Function(_typing_hrint)) +register_global(__hmin2, types.Function(_typing___hmin2)) @register -class _typing_h2trunc(ConcreteTemplate): - key = globals()["h2trunc"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmax2_nan(ConcreteTemplate): + key = globals()["__hmax2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2trunc, types.Function(_typing_h2trunc)) +register_global(__hmax2_nan, types.Function(_typing___hmax2_nan)) @register -class _typing_h2ceil(ConcreteTemplate): - key = globals()["h2ceil"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmin2_nan(ConcreteTemplate): + key = globals()["__hmin2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2ceil, types.Function(_typing_h2ceil)) +register_global(__hmin2_nan, types.Function(_typing___hmin2_nan)) @register -class _typing_h2floor(ConcreteTemplate): - key = globals()["h2floor"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hfma2_relu(ConcreteTemplate): + key = globals()["__hfma2_relu"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] -register_global(h2floor, types.Function(_typing_h2floor)) +register_global(__hfma2_relu, types.Function(_typing___hfma2_relu)) @register -class _typing_h2rint(ConcreteTemplate): - key = globals()["h2rint"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hcmadd(ConcreteTemplate): + key = globals()["__hcmadd"] + cases = [ + signature( + 
_type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] -register_global(h2rint, types.Function(_typing_h2rint)) +register_global(__hcmadd, types.Function(_typing___hcmadd)) @register @@ -4991,9 +15962,18 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) +@register +class _typing___half(ConcreteTemplate): + key = globals()["__half"] + cases = [signature(void, _type___nv_bfloat16)] + + +register_global(__half, types.Function(_typing___half)) + + @register_global(operator.add) -class _typing_operator_add(ConcreteTemplate): - cases = [ +class _typing_operator_add(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5004,8 +15984,8 @@ class _typing_operator_add(ConcreteTemplate): @register_global(operator.sub) -class _typing_operator_sub(ConcreteTemplate): - cases = [ +class _typing_operator_sub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5016,8 +15996,8 @@ class _typing_operator_sub(ConcreteTemplate): @register_global(operator.mul) -class _typing_operator_mul(ConcreteTemplate): - cases = [ +class _typing_operator_mul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5028,8 +16008,8 @@ class _typing_operator_mul(ConcreteTemplate): @register_global(operator.truediv) -class _typing_operator_truediv(ConcreteTemplate): - cases = [ +class _typing_operator_truediv(BinOpTrueDiv): + cases = BinOpTrueDiv.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5040,8 +16020,8 @@ class _typing_operator_truediv(ConcreteTemplate): @register_global(operator.iadd) -class _typing_operator_iadd(ConcreteTemplate): - cases = [ +class _typing_operator_iadd(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 
), @@ -5052,8 +16032,8 @@ class _typing_operator_iadd(ConcreteTemplate): @register_global(operator.isub) -class _typing_operator_isub(ConcreteTemplate): - cases = [ +class _typing_operator_isub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5064,8 +16044,8 @@ class _typing_operator_isub(ConcreteTemplate): @register_global(operator.imul) -class _typing_operator_imul(ConcreteTemplate): - cases = [ +class _typing_operator_imul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5076,8 +16056,8 @@ class _typing_operator_imul(ConcreteTemplate): @register_global(operator.itruediv) -class _typing_operator_itruediv(ConcreteTemplate): - cases = [ +class _typing_operator_itruediv(BinOpTrueDiv): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -5088,71 +16068,327 @@ class _typing_operator_itruediv(ConcreteTemplate): @register_global(operator.pos) -class _typing_operator_pos(ConcreteTemplate): - cases = [ +class _typing_operator_pos(UnaryPositive): + cases = UnaryPositive.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.neg) -class _typing_operator_neg(ConcreteTemplate): - cases = [ +class _typing_operator_neg(UnaryNegate): + cases = UnaryNegate.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.eq) -class _typing_operator_eq(ConcreteTemplate): - cases = [ +class _typing_operator_eq(UnorderedCmpOp): + cases = UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ne) -class _typing_operator_ne(ConcreteTemplate): - cases = [ +class _typing_operator_ne(UnorderedCmpOp): + cases = 
UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.gt) -class _typing_operator_gt(ConcreteTemplate): - cases = [ +class _typing_operator_gt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.lt) -class _typing_operator_lt(ConcreteTemplate): - cases = [ +class _typing_operator_lt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ge) -class _typing_operator_ge(ConcreteTemplate): - cases = [ +class _typing_operator_ge(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.le) -class _typing_operator_le(ConcreteTemplate): - cases = [ +class _typing_operator_le(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] # Aliases: -__nv_bfloat16_raw = unnamed1401637 -__nv_bfloat162_raw = unnamed1401746 +__nv_bfloat16_raw = unnamed1405307 +__nv_bfloat162_raw = unnamed1405416 nv_bfloat16 = __nv_bfloat16 nv_bfloat162 = __nv_bfloat162 + + +# Symbols: + + +_NBTYPE_SYMBOLS = [ + "_type_unnamed1405307", + "_type_unnamed1405416", + "_type___nv_bfloat16", + "_type___nv_bfloat162", +] + + +_RECORD_SYMBOLS = [ + "unnamed1405307", + "unnamed1405416", + "__nv_bfloat16", + "__nv_bfloat162", +] + + +_FUNCTION_SYMBOLS = [ + "__double2bfloat16", + "__float2bfloat16", + "__float2bfloat16_rn", + "__float2bfloat16_rz", + "__float2bfloat16_rd", + "__float2bfloat16_ru", + "__bfloat162float", + "__float2bfloat162_rn", + 
"__floats2bfloat162_rn", + "__low2float", + "__high2float", + "__float22bfloat162_rn", + "__bfloat1622float2", + "__bfloat162char_rz", + "__bfloat162uchar_rz", + "__bfloat162int_rn", + "__bfloat162int_rz", + "__bfloat162int_rd", + "__bfloat162int_ru", + "__int2bfloat16_rn", + "__int2bfloat16_rz", + "__int2bfloat16_rd", + "__int2bfloat16_ru", + "__bfloat162short_rn", + "__bfloat162short_rz", + "__bfloat162short_rd", + "__bfloat162short_ru", + "__short2bfloat16_rn", + "__short2bfloat16_rz", + "__short2bfloat16_rd", + "__short2bfloat16_ru", + "__bfloat162uint_rn", + "__bfloat162uint_rz", + "__bfloat162uint_rd", + "__bfloat162uint_ru", + "__uint2bfloat16_rn", + "__uint2bfloat16_rz", + "__uint2bfloat16_rd", + "__uint2bfloat16_ru", + "__bfloat162ushort_rn", + "__bfloat162ushort_rz", + "__bfloat162ushort_rd", + "__bfloat162ushort_ru", + "__ushort2bfloat16_rn", + "__ushort2bfloat16_rz", + "__ushort2bfloat16_rd", + "__ushort2bfloat16_ru", + "__bfloat162ull_rn", + "__bfloat162ull_rz", + "make_bfloat162", + "__bfloat162ull_rd", + "__bfloat162ull_ru", + "__ull2bfloat16_rn", + "__ull2bfloat16_rz", + "__ull2bfloat16_rd", + "__ull2bfloat16_ru", + "__bfloat162ll_rn", + "__bfloat162ll_rz", + "__bfloat162ll_rd", + "__bfloat162ll_ru", + "__ll2bfloat16_rn", + "__ll2bfloat16_rz", + "__ll2bfloat16_rd", + "__ll2bfloat16_ru", + "htrunc", + "hceil", + "hfloor", + "hrint", + "h2trunc", + "h2ceil", + "h2floor", + "h2rint", + "__bfloat162bfloat162", + "__lowhigh2highlow", + "__lows2bfloat162", + "__highs2bfloat162", + "__high2bfloat16", + "__low2bfloat16", + "__hisinf", + "__halves2bfloat162", + "__low2bfloat162", + "__high2bfloat162", + "__bfloat16_as_short", + "__bfloat16_as_ushort", + "__short_as_bfloat16", + "__ushort_as_bfloat16", + "__shfl_sync", + "__shfl_sync", + "__shfl_up_sync", + "__shfl_up_sync", + "__shfl_down_sync", + "__shfl_down_sync", + "__shfl_xor_sync", + "__shfl_xor_sync", + "__ldg", + "__ldg", + "__ldcg", + "__ldcg", + "__ldca", + "__ldca", + "__ldcs", + "__ldcs", + 
"__ldlu", + "__ldlu", + "__ldcv", + "__ldcv", + "__stwb", + "__stwb", + "__stcg", + "__stcg", + "__stcs", + "__stcs", + "__stwt", + "__stwt", + "__heq2", + "__hne2", + "__hle2", + "__hge2", + "__hlt2", + "__hgt2", + "__hequ2", + "__hneu2", + "__hleu2", + "__hgeu2", + "__hltu2", + "__hgtu2", + "__heq2_mask", + "__hne2_mask", + "__hle2_mask", + "__hge2_mask", + "__hlt2_mask", + "__hgt2_mask", + "__hequ2_mask", + "__hneu2_mask", + "__hleu2_mask", + "__hgeu2_mask", + "__hltu2_mask", + "__hgtu2_mask", + "__hisnan2", + "__hadd2", + "__hsub2", + "__hmul2", + "__hadd2_rn", + "__hsub2_rn", + "__hmul2_rn", + "__h2div", + "__habs2", + "__hadd2_sat", + "__hsub2_sat", + "__hmul2_sat", + "__hfma2", + "__hfma2_sat", + "__hneg2", + "__habs", + "__hadd", + "__hsub", + "__hmul", + "__hadd_rn", + "__hsub_rn", + "__hmul_rn", + "__hdiv", + "__hadd_sat", + "__hsub_sat", + "__hmul_sat", + "__hfma", + "__hfma_sat", + "__hneg", + "__hbeq2", + "__hbne2", + "__hble2", + "__hbge2", + "__hblt2", + "__hbgt2", + "__hbequ2", + "__hbneu2", + "__hbleu2", + "__hbgeu2", + "__hbltu2", + "__hbgtu2", + "__heq", + "__hne", + "__hle", + "__hge", + "__hlt", + "__hgt", + "__hequ", + "__hneu", + "__hleu", + "__hgeu", + "__hltu", + "__hgtu", + "__hisnan", + "__hmax", + "__hmin", + "__hmax_nan", + "__hmin_nan", + "__hfma_relu", + "__hmax2", + "__hmin2", + "__hmax2_nan", + "__hmin2_nan", + "__hfma2_relu", + "__hcmadd", + "hsqrt", + "hrsqrt", + "hrcp", + "hlog", + "hlog2", + "hlog10", + "hexp", + "htanh_approx", + "h2tanh_approx", + "htanh", + "h2tanh", + "hexp2", + "hexp10", + "hcos", + "hsin", + "h2sqrt", + "h2rsqrt", + "h2rcp", + "h2log", + "h2log2", + "h2log10", + "h2exp", + "h2exp2", + "h2exp10", + "h2cos", + "h2sin", + "atomicAdd", +] + + +__all__ = _NBTYPE_SYMBOLS + _RECORD_SYMBOLS + _FUNCTION_SYMBOLS diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index cb2f41dc6..54f3a0b74 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -2,8 +2,116 @@ # 
SPDX-License-Identifier: BSD-2-Clause from numba.cuda._internal.cuda_bf16 import ( - _type_class___nv_bfloat16, + typing_registry, + target_registry, nv_bfloat16 as bfloat16, + # Arithmetic intrinsics + __habs as habs, + __hadd as hadd, + __hsub as hsub, + __hmul as hmul, + __hadd_rn as hadd_rn, + __hsub_rn as hsub_rn, + __hmul_rn as hmul_rn, + __hdiv as hdiv, + __hadd_sat as hadd_sat, + __hsub_sat as hsub_sat, + __hmul_sat as hmul_sat, + __hfma as hfma, + __hfma_sat as hfma_sat, + __hneg as hneg, + __hfma_relu as hfma_relu, + # Comparison intrinsics + __heq as heq, + __hne as hne, + __hge as hge, + __hgt as hgt, + __hle as hle, + __hlt as hlt, + __hmax as hmax, + __hmin as hmin, + __hmax_nan as hmax_nan, + __hmin_nan as hmin_nan, + __hisinf as hisinf, + __hisnan as hisnan, + # Unordered comparison intrinsics + __hequ as hequ, + __hneu as hneu, + __hgeu as hgeu, + __hgtu as hgtu, + __hleu as hleu, + __hltu as hltu, + # Precision conversion and data movement + # - floating-point family + __bfloat162float as bfloat162float, + __float2bfloat16 as float2bfloat16, + __double2bfloat16 as double2bfloat16, + __float2bfloat16_rn as float2bfloat16_rn, + __float2bfloat16_rz as float2bfloat16_rz, + __float2bfloat16_rd as float2bfloat16_rd, + __float2bfloat16_ru as float2bfloat16_ru, + # - char family + __bfloat162char_rz as bfloat162char_rz, + __bfloat162uchar_rz as bfloat162uchar_rz, + # - int family (signed 32-bit) + __int2bfloat16_rn as int2bfloat16_rn, + __int2bfloat16_rz as int2bfloat16_rz, + __int2bfloat16_rd as int2bfloat16_rd, + __int2bfloat16_ru as int2bfloat16_ru, + __bfloat162int_rn as bfloat162int_rn, + __bfloat162int_rz as bfloat162int_rz, + __bfloat162int_rd as bfloat162int_rd, + __bfloat162int_ru as bfloat162int_ru, + # - short family (signed 16-bit) + __short2bfloat16_rn as short2bfloat16_rn, + __short2bfloat16_rz as short2bfloat16_rz, + __short2bfloat16_rd as short2bfloat16_rd, + __short2bfloat16_ru as short2bfloat16_ru, + __bfloat162short_rn as 
bfloat162short_rn, + __bfloat162short_rz as bfloat162short_rz, + __bfloat162short_rd as bfloat162short_rd, + __bfloat162short_ru as bfloat162short_ru, + # - ushort family (unsigned 16-bit) + __ushort2bfloat16_rn as ushort2bfloat16_rn, + __ushort2bfloat16_rz as ushort2bfloat16_rz, + __ushort2bfloat16_rd as ushort2bfloat16_rd, + __ushort2bfloat16_ru as ushort2bfloat16_ru, + __bfloat162ushort_rn as bfloat162ushort_rn, + __bfloat162ushort_rz as bfloat162ushort_rz, + __bfloat162ushort_rd as bfloat162ushort_rd, + __bfloat162ushort_ru as bfloat162ushort_ru, + # - uint family (unsigned 32-bit) + __uint2bfloat16_rn as uint2bfloat16_rn, + __uint2bfloat16_rz as uint2bfloat16_rz, + __uint2bfloat16_rd as uint2bfloat16_rd, + __uint2bfloat16_ru as uint2bfloat16_ru, + __bfloat162uint_rn as bfloat162uint_rn, + __bfloat162uint_rz as bfloat162uint_rz, + __bfloat162uint_rd as bfloat162uint_rd, + __bfloat162uint_ru as bfloat162uint_ru, + # - ll family (signed 64-bit) + __ll2bfloat16_rn as ll2bfloat16_rn, + __ll2bfloat16_rz as ll2bfloat16_rz, + __ll2bfloat16_rd as ll2bfloat16_rd, + __ll2bfloat16_ru as ll2bfloat16_ru, + __bfloat162ll_rn as bfloat162ll_rn, + __bfloat162ll_rz as bfloat162ll_rz, + __bfloat162ll_rd as bfloat162ll_rd, + __bfloat162ll_ru as bfloat162ll_ru, + # - ull family (unsigned 64-bit) + __ull2bfloat16_rn as ull2bfloat16_rn, + __ull2bfloat16_rz as ull2bfloat16_rz, + __ull2bfloat16_rd as ull2bfloat16_rd, + __ull2bfloat16_ru as ull2bfloat16_ru, + __bfloat162ull_rn as bfloat162ull_rn, + __bfloat162ull_rz as bfloat162ull_rz, + __bfloat162ull_rd as bfloat162ull_rd, + __bfloat162ull_ru as bfloat162ull_ru, + # - bit reinterpret casts + __bfloat16_as_short as bfloat16_as_short, + __bfloat16_as_ushort as bfloat16_as_ushort, + __short_as_bfloat16 as short_as_bfloat16, + __ushort_as_bfloat16 as ushort_as_bfloat16, htrunc, hceil, hfloor, @@ -28,7 +136,7 @@ def _make_unary(a, func): - if isinstance(a, _type_class___nv_bfloat16): + if a == bfloat16: return lambda a: func(a) @@ -92,9 
+200,184 @@ def exp2_ol(a): except ImportError: pass +## Public aliases using Numba/Numpy-style type names +# Floating-point +float32_to_bfloat16 = float2bfloat16 +float64_to_bfloat16 = double2bfloat16 +bfloat16_to_float32 = bfloat162float +float32_to_bfloat16_rn = float2bfloat16_rn +float32_to_bfloat16_rz = float2bfloat16_rz +float32_to_bfloat16_rd = float2bfloat16_rd +float32_to_bfloat16_ru = float2bfloat16_ru + +# Char (8-bit) +bfloat16_to_int8_rz = bfloat162char_rz +bfloat16_to_uint8_rz = bfloat162uchar_rz + +# Int16 / UInt16 +int16_to_bfloat16_rn = short2bfloat16_rn +int16_to_bfloat16_rz = short2bfloat16_rz +int16_to_bfloat16_rd = short2bfloat16_rd +int16_to_bfloat16_ru = short2bfloat16_ru +bfloat16_to_int16_rn = bfloat162short_rn +bfloat16_to_int16_rz = bfloat162short_rz +bfloat16_to_int16_rd = bfloat162short_rd +bfloat16_to_int16_ru = bfloat162short_ru + +uint16_to_bfloat16_rn = ushort2bfloat16_rn +uint16_to_bfloat16_rz = ushort2bfloat16_rz +uint16_to_bfloat16_rd = ushort2bfloat16_rd +uint16_to_bfloat16_ru = ushort2bfloat16_ru +bfloat16_to_uint16_rn = bfloat162ushort_rn +bfloat16_to_uint16_rz = bfloat162ushort_rz +bfloat16_to_uint16_rd = bfloat162ushort_rd +bfloat16_to_uint16_ru = bfloat162ushort_ru + +# Int32 / UInt32 +int32_to_bfloat16_rn = int2bfloat16_rn +int32_to_bfloat16_rz = int2bfloat16_rz +int32_to_bfloat16_rd = int2bfloat16_rd +int32_to_bfloat16_ru = int2bfloat16_ru +bfloat16_to_int32_rn = bfloat162int_rn +bfloat16_to_int32_rz = bfloat162int_rz +bfloat16_to_int32_rd = bfloat162int_rd +bfloat16_to_int32_ru = bfloat162int_ru + +uint32_to_bfloat16_rn = uint2bfloat16_rn +uint32_to_bfloat16_rz = uint2bfloat16_rz +uint32_to_bfloat16_rd = uint2bfloat16_rd +uint32_to_bfloat16_ru = uint2bfloat16_ru +bfloat16_to_uint32_rn = bfloat162uint_rn +bfloat16_to_uint32_rz = bfloat162uint_rz +bfloat16_to_uint32_rd = bfloat162uint_rd +bfloat16_to_uint32_ru = bfloat162uint_ru + +# Int64 / UInt64 +int64_to_bfloat16_rn = ll2bfloat16_rn +int64_to_bfloat16_rz = 
ll2bfloat16_rz +int64_to_bfloat16_rd = ll2bfloat16_rd +int64_to_bfloat16_ru = ll2bfloat16_ru +bfloat16_to_int64_rn = bfloat162ll_rn +bfloat16_to_int64_rz = bfloat162ll_rz +bfloat16_to_int64_rd = bfloat162ll_rd +bfloat16_to_int64_ru = bfloat162ll_ru + +uint64_to_bfloat16_rn = ull2bfloat16_rn +uint64_to_bfloat16_rz = ull2bfloat16_rz +uint64_to_bfloat16_rd = ull2bfloat16_rd +uint64_to_bfloat16_ru = ull2bfloat16_ru +bfloat16_to_uint64_rn = bfloat162ull_rn +bfloat16_to_uint64_rz = bfloat162ull_rz +bfloat16_to_uint64_rd = bfloat162ull_rd +bfloat16_to_uint64_ru = bfloat162ull_ru + +# Bit reinterpret casts +bfloat16_as_int16 = bfloat16_as_short +bfloat16_as_uint16 = bfloat16_as_ushort +int16_as_bfloat16 = short_as_bfloat16 +uint16_as_bfloat16 = ushort_as_bfloat16 __all__ = [ + "typing_registry", + "target_registry", "bfloat16", + # Arithmetic intrinsics + "habs", + "hadd", + "hsub", + "hmul", + "hadd_rn", + "hsub_rn", + "hmul_rn", + "hdiv", + "hadd_sat", + "hsub_sat", + "hmul_sat", + "hfma", + "hfma_sat", + "hneg", + "hfma_relu", + # Comparison intrinsics + "heq", + "hne", + "hge", + "hgt", + "hle", + "hlt", + "hmax", + "hmin", + "hmax_nan", + "hmin_nan", + "hisinf", + "hisnan", + "hequ", + "hneu", + "hgeu", + "hgtu", + "hleu", + "hltu", + # Precision conversion and data movement + "float32_to_bfloat16", + "float64_to_bfloat16", + "bfloat16_to_float32", + "float32_to_bfloat16_rn", + "float32_to_bfloat16_rz", + "float32_to_bfloat16_rd", + "float32_to_bfloat16_ru", + "bfloat16_to_int8_rz", + "bfloat16_to_uint8_rz", + "int16_to_bfloat16_rn", + "int16_to_bfloat16_rz", + "int16_to_bfloat16_rd", + "int16_to_bfloat16_ru", + "bfloat16_to_int16_rn", + "bfloat16_to_int16_rz", + "bfloat16_to_int16_rd", + "bfloat16_to_int16_ru", + "uint16_to_bfloat16_rn", + "uint16_to_bfloat16_rz", + "uint16_to_bfloat16_rd", + "uint16_to_bfloat16_ru", + "bfloat16_to_uint16_rn", + "bfloat16_to_uint16_rz", + "bfloat16_to_uint16_rd", + "bfloat16_to_uint16_ru", + "int32_to_bfloat16_rn", + 
"int32_to_bfloat16_rz", + "int32_to_bfloat16_rd", + "int32_to_bfloat16_ru", + "bfloat16_to_int32_rn", + "bfloat16_to_int32_rz", + "bfloat16_to_int32_rd", + "bfloat16_to_int32_ru", + "uint32_to_bfloat16_rn", + "uint32_to_bfloat16_rz", + "uint32_to_bfloat16_rd", + "uint32_to_bfloat16_ru", + "bfloat16_to_uint32_rn", + "bfloat16_to_uint32_rz", + "bfloat16_to_uint32_rd", + "bfloat16_to_uint32_ru", + "int64_to_bfloat16_rn", + "int64_to_bfloat16_rz", + "int64_to_bfloat16_rd", + "int64_to_bfloat16_ru", + "bfloat16_to_int64_rn", + "bfloat16_to_int64_rz", + "bfloat16_to_int64_rd", + "bfloat16_to_int64_ru", + "uint64_to_bfloat16_rn", + "uint64_to_bfloat16_rz", + "uint64_to_bfloat16_rd", + "uint64_to_bfloat16_ru", + "bfloat16_to_uint64_rn", + "bfloat16_to_uint64_rz", + "bfloat16_to_uint64_rd", + "bfloat16_to_uint64_ru", + "bfloat16_as_int16", + "bfloat16_as_uint16", + "int16_as_bfloat16", + "uint16_as_bfloat16", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 1a1035a25..a26e24c93 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -355,7 +355,11 @@ def compile(src, name, cc, ltoir=False): cudadrv_path = os.path.dirname(os.path.abspath(__file__)) numba_cuda_path = os.path.dirname(cudadrv_path) - numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}" + nvrtc_ver_major = version[0] + if nvrtc_ver_major == 12: + numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}" + elif nvrtc_ver_major == 13: + numba_include = f"{os.path.join(numba_cuda_path, 'include', '13')}" if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS: extra_includes = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":") diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h new file mode 100644 index 000000000..38feffba0 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h @@ -0,0 +1,5118 @@ +/* +* Copyright 1993-2024 
NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions. +* To use these functions, include the header file \p cuda_bf16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. Specific examples are: +* - hsin(__nv_bfloat16); +* - hcos(__nv_bfloat16); +* - h2sin(__nv_bfloat162); +* - h2cos(__nv_bfloat162); +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent +* the use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p __nv_bfloat16 which is essentially a user-defined type. 
+* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ - +* If defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these constants, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +#ifndef __CUDA_BF16_H__ +#define __CUDA_BF16_H__ + +/* bring in __half data type and operations, for use in converting constructors */ +#include "cuda_fp16.h" + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +#define __CUDA_BF16_TYPES_EXIST__ + +/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ +#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var))) +#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var))) +#define __BFLOAT162_TO_UI(var)
*(reinterpret_cast<unsigned int *>(&(var))) +#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var))) + +/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ +struct __nv_bfloat16; +struct __nv_bfloat162; + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode. +* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2bfloat16(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value.
+* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-towards-zero mode. +* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. 
+* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-down mode. +* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-up mode. +* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts \p nv_bfloat16 number to float. +* +* \details Converts nv_bfloat16 number \p a to float. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __bfloat162float(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* +* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 +* precision number. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even +* mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2bfloat16_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. 
+* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the +* result as a \p float2 packed value. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __bfloat162char_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rn(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. 
+* - __bfloat162int_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. 
+* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. 
+* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 numbers \p x and \p y into one \p nv_bfloat162 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - nv_bfloat16. Is only being read. +* \param[in] y - nv_bfloat16.
Is only being read. +* +* \returns __nv_bfloat162 +* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. 
+* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. 
+* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The smallest integer value not less than \p h. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Round input to nearest integer value in nv_bfloat16 floating-point +* number. +* +* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument.
+* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of largest integers which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Round input to nearest integer value in nv_bfloat16 floating-point +* number. +* +* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in +* nv_bfloat16 floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of rounded integer values. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns \p nv_bfloat162 with both halves equal to the input value. +* +* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 +* number. +* \param[in] a - nv_bfloat16. Is only being read.
+* +* \returns nv_bfloat162 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Swaps both halves of the \p nv_bfloat162 input. +* +* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number +* with swapped halves. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. 
High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. 
+* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. 
+* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. 
Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. 
+* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. 
Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162.
Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The negation of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise.
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. 
Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. 
+* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). 
+* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even +* mode. 
+* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function. +* +* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function. +* +* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. 
+* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. 
Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. 
The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices use emulation path. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices of compute capability 7.x and 8.x use emulation path. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. +* +* \returns __nv_bfloat16 +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for ::std::move. + * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include <utility> +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include <cstring> +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include <nv/target> +#endif /* !defined(__CUDACC_RTC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_INLINE__ +#define __CUDA_BF16_FORCEINLINE__ +#else +#define __CUDA_BF16_INLINE__ inline +#define __CUDA_BF16_FORCEINLINE__ __forceinline__ +#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_BF16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_BF16_CONSTEXPR__ constexpr +#else +#define __CUDA_BF16_CONSTEXPR__ +#endif + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat16_raw data type + * \details Type allows static initialization of \p nv_bfloat16 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat16, + * and not a conversion from \p short to \p nv_bfloat16.
+ * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p nv_bfloat16 floating-point number. + */ + unsigned short x; +} __nv_bfloat16_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat162_raw data type + * \details Type allows static initialization of \p nv_bfloat162 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat162, + * and not a conversion from \p short2 to \p nv_bfloat162. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p nv_bfloat16 part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p nv_bfloat16 part. + */ + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Empty default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. + */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16.
+ */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile; + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode. + */ + explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2bfloat16(__half2float(f)).__x; +) +} +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p float operator. + */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode. Uses the \p long \p long conversion when sizeof(long) == sizeof(long long), otherwise the \p int conversion. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2bfloat16_rn(static_cast<long long>(val)).__x; + } else { + __x = __int2bfloat16_rn(static_cast<int>(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode. Uses the \p unsigned \p long \p long conversion when sizeof(unsigned long) == sizeof(unsigned long long), otherwise the \p unsigned \p int conversion. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2bfloat16_rn(static_cast<unsigned long long>(val)).__x; + } else { + __x = __uint2bfloat16_rn(static_cast<unsigned int>(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode.
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162char_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162uchar_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in signed and unsigned char operators. + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162short_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ushort_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162int_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p int data type. 
+ * Using round-toward-zero rounding mode. + * + * See __bfloat162uint_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ll_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ull_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 addition operation. + * See also __hadd(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 subtraction operation. + * See also __hsub(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 multiplication operation. 
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 division operation. + * See also __hdiv(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with division operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix decrement operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored); +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary minus operator. + * See also __hneg(__nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h); + +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered compare equal operation. + * See also __heq(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 unordered compare not-equal operation. + * See also __hneu(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-than compare operation. 
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-than compare operation. + * See also __hlt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hge(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-or-equal compare operation. + * See also __hle(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat162 datatype + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + /** + * Storage field holding lower \p __nv_bfloat16 part. + */ + __nv_bfloat16 x; + /** + * Storage field holding upper \p __nv_bfloat16 part. + */ + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Empty default constructor, result is uninitialized.
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src); +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162(); +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from two \p __nv_bfloat16 variables + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src); + + /* Convert to/from __nv_bfloat162_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const; +}; + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 addition operation. 
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 subtraction operation. + * See also __hsub2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 multiplication operation. + * See also __hmul2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 division operation. + * See also __h2div(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with division operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary minus operator. + * See also __hneg2(__nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered compare equal operation. + * See also __hbeq2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 unordered compare not-equal operation. 
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-than compare operation. + * See also __hbgt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-than compare operation. + * See also __hblt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hbge2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation. 
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); + +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) /* Out-of-class definition of the __half constructor taking __nv_bfloat16: emits cvt.rn.f16.bf16 when NV_PROVIDES_SM_90 holds, otherwise converts via float (__bfloat162float then __float2half_rn). */ +__CUDA_HOSTDEVICE__ +#ifdef __CUDACC_RTC__ +inline +#else +__CUDA_BF16_FORCEINLINE__ +#endif /* __CUDACC_RTC__ */ +__half::__half(const __nv_bfloat16 f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2half_rn(__bfloat162float(f)).__x; +) +} +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) +/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. +*/ +#include "cuda_bf16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the bfloat16 numbers format. + * + * \details Should be implemented in the compiler in the future.
+ * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat16 nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of bfloat16 numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_BF16_INLINE__ +#undef __CUDA_BF16_FORCEINLINE__ +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp new file mode 100644 index 000000000..5f610c976 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp @@ -0,0 +1,3865 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_BF16_HPP__) +#define __CUDA_BF16_HPP__ + +#if !defined(__CUDA_BF16_H__) +#error "Do not include this file directly. Instead, include cuda_bf16.h." 
+#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines floating-point positive infinity value for the \p nv_bfloat16 data type + */ +#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines canonical NaN value for the \p nv_bfloat16 data type + */ +#define CUDART_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p nv_bfloat16 data type + */ +#define CUDART_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a maximum representable value for the \p nv_bfloat16 data type + */ +#define CUDART_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a negative zero value for the \p nv_bfloat16 data type + */ +#define CUDART_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a positive zero value for the \p nv_bfloat16 data type + */ +#define CUDART_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a value of 1.0 for the \p nv_bfloat16 data type + */ +#define CUDART_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U) + +#if !(defined __DOXYGEN_ONLY__) + + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const 
__nv_bfloat16_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator float() const { return __bfloat162float(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. 
+ */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator signed char() const { return __bfloat162char_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned char() const { return __bfloat162uchar_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) /* true only when plain char is a signed type */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast<char>(__bfloat162char_rz(*this)); + } + else + { + value = static_cast<char>(__bfloat162uchar_rz(*this)); + } + return value; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator short() const { return __bfloat162short_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned short() const { return __bfloat162ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator int() const { return __bfloat162int_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned int() const { return __bfloat162uint_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) /* choose the conversion whose result width matches long */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = 
static_cast<long>(__bfloat162ll_rz(*this)); + } + else + { + retval = static_cast<long>(__bfloat162int_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) /* choose the conversion whose result width matches unsigned long */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast<unsigned long>(__bfloat162ull_rz(*this)); + } + else + { + retval = static_cast<unsigned long>(__bfloat162uint_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long long() const { return __bfloat162ll_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long long() const { return __bfloat162ull_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } +#endif /* !(defined 
__CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ + + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast<void>(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; /* bfloat16 bit pattern of 1.0 */ + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; /* bfloat16 bit pattern of 1.0 */ + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(__nv_bfloat162 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(__nv_bfloat162 
&&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#else +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::operator __nv_bfloat162_raw() const { + __nv_bfloat162_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); +, + ret.x = static_cast<__nv_bfloat16_raw>(this->x).x; + ret.y = static_cast<__nv_bfloat16_raw>(this->y).x; +) + return ret; +} + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ 
__nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast<void>(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; /* both halves hold the bfloat16 bit pattern of 1.0 */ + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; /* both halves hold the bfloat16 bit pattern of 1.0 */ + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_float_as_uint(const float f) +{ + unsigned int u; +IF_DEVICE_OR_CUDACC( + u = __float_as_uint(f); +, + memcpy(&u, &f, sizeof(f)); +, + ::std::memcpy(&u, &f, sizeof(f)); +) + return u; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_uint_as_float(const unsigned int u) +{ + float f; +IF_DEVICE_OR_CUDACC( + f = __uint_as_float(u); +, + memcpy(&f, &u, sizeof(u)); +, + ::std::memcpy(&f, &u, sizeof(u)); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + + x = __internal_float_as_uint(f); + + if ((x & 0x7fffffffU) > 0x7f800000U) { /* NaN input: report the canonical NaN pattern */ + sign = 0U; + remainder = 0U; + return static_cast<unsigned short>(0x7fffU); + } + sign = x >> 31U; + remainder = x << 16U; /* the low 16 float bits that truncation drops */ + return static_cast<unsigned short>(x >> 16U); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_double2float_rn(const double x) +{ + float r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f32.f64 %0, %1;" : "=f"(r) : "d"(x)); +, + r = static_cast<float>(x); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ double __internal_float2double(const float x) +{ + double r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.f64.f32 %0, %1;" : "=d"(r) : "f"(x)); +, + r = static_cast<double>(x); +) + return r; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); + return val; +, + float f = __internal_double2float_rn(x); + const double d = __internal_float2double(f); + unsigned int u = __internal_float_as_uint(f); + + bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); + + + if ((x > 0.0) && (d > x)) { + u--; + } + 
if ((x < 0.0) && (d < x)) { + u--; + } + if ((d != x) && x_is_not_nan) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + + return __float2bfloat16(f); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; + 
return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg .b16 low;\n" + " cvt.rn.bf16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ float __internal_device_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); +, + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); +) + return f; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + f = __internal_device_bfloat162float(h); +, + unsigned int u = static_cast<unsigned int>(h) << 16; /* place the 16 bfloat16 bits in the upper half of the float pattern */ + f = __internal_uint_as_float(u); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) +{ + return 
__internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); +} +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); +} + +/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) +{ + __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) +{ + float hi_float; + float lo_float; + lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); + hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2int_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ int __internal_bfloat162int_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2int_rz(f); +, + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); /* shift out the sign bit before the NaN range check */ + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f >= static_cast<float>(max_val)) 
{ + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast<int>(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162int_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __int2float_ru(i); + const float rd = __int2float_rd(i); + float rz = __int2float_rz(i); + if (ru != rd) { /* i is not exactly representable as float: set the lowest bit of rz */ + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_int2bfloat16_rn(i); +, + const double d = static_cast<double>(i); + return __double2bfloat16(d); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h) +{ + signed char i; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.s8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + const unsigned char u = static_cast<unsigned char>(tmp); + i = static_cast<signed char>(u); +, + const float f = __bfloat162float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); /* shift out the sign bit before the NaN range check */ + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<signed char>(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.u8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + i = static_cast<unsigned char>(tmp); +, + const float f = __bfloat162float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); /* shift out the sign bit before the NaN range check */ + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<unsigned char>(f); + } +) + return i; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return 
__float2bfloat16_rz(__int2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ short int __internal_device_bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162short_rz(h); +, + const float f = __bfloat162float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum 
+ val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__int2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(static_cast(i))); +) +} 
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_bfloat162uint_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2uint_rz(f); +, + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162uint_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const 
__nv_bfloat16 h) +{ + unsigned int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __uint2float_ru(i); + const float rd = __uint2float_rd(i); + float rz = __uint2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_uint2bfloat16_rn(i); +, + const double d = static_cast(i); + return __double2bfloat16(d); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ unsigned short int 
__bfloat162ushort_rn(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ unsigned short int __internal_device_bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162ushort_rz(h); +, + const float f = __bfloat162float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __internal_device_bfloat162ull_rz(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ull_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ull_rz(h); +, + const float f = __bfloat162float(h); + unsigned long long int i; + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ull2bfloat16_rn(const 
unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ull2float_ru(i); + const float rd = __ull2float_rd(i); + float rz = __ull2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ull2bfloat16_rn(i); +, + float f = static_cast(i); + const unsigned long long int uf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + // round up happened here + // note: no need to handle round up to f == 0x1.p64 specially + if (uf > i) { + u--; + } + if (uf != i) { + u |= 1U; + } + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ull2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ull2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ull2float_ru(i)); +) +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + long long int i; + asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ll_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ long long int __internal_device_bfloat162ll_rz(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ll_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ll_rz(h); +, + long long int i; + const float f = __bfloat162float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = min_val; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return 
i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ll2float_ru(i); + const float rd = __ll2float_rd(i); + float rz = __ll2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ll2bfloat16_rn(i); +, + float f = static_cast(i); + const long long int lf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + + if ((f > 0.0f) && (lf > i)) { + u--; + } + if ((f < 0.0f) && (lf < i)) { + u--; + } + if (lf != i) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ll2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ll2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ll2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 
r; + asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(truncf(__bfloat162float(h))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rpi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rmi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(rintf(__bfloat162float(h))); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = htrunc(h.x); + const __nv_bfloat16 high = htrunc(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hceil(h.x); + const __nv_bfloat16 high = hceil(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hfloor(h.x); + const __nv_bfloat16 high = hfloor(h.y); + return __nv_bfloat162(low, high); +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) +{ + return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ 
+// Builds a pair from the low halves of a and b: result = {a.x, b.x}.
+// Device code does the shuffle in PTX; host code assigns the members.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        " mov.b32 {alow,ahigh}, %1;\n"
+        " mov.b32 {blow,bhigh}, %2;\n"
+        " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.x;
+    val.y = b.x;
+)
+    return val;
+}
+// Builds a pair from the high halves of a and b: result = {a.y, b.y}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        " mov.b32 {alow,ahigh}, %1;\n"
+        " mov.b32 {blow,bhigh}, %2;\n"
+        " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+,
+    val.x = a.y;
+    val.y = b.y;
+)
+    return val;
+}
+// Returns the low half (a.x) of the pair.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.x;
+)
+    return ret;
+}
+// Infinity test on the raw bit pattern: returns -1 for 0xFF80 (negative
+// infinity), 1 for 0x7F80 (positive infinity) and 0 for everything else.
+__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a)
+{
+    int retval;
+    const __nv_bfloat16_raw araw = __nv_bfloat16_raw(a);
+    if (araw.x == 0xFF80U) {
+        retval = -1;
+    } else if (araw.x == 0x7F80U) {
+        retval = 1;
+    } else {
+        retval = 0;
+    }
+    return retval;
+}
+// Broadcasts the low half of a into both halves: result = {a.x, a.x}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.x;
+    val.y = a.x;
+)
+    return val;
+}
+// Broadcasts the high half of a into both halves: result = {a.y, a.y}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y;
+    val.y = a.y;
+)
+    return val;
+}
+// Returns the high half (a.y) of the pair.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    ret = a.y;
+)
+    return ret;
+}
+// Packs two scalars into a pair: result = {a, b}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{ mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b)));
+,
+    val.x = a;
+    val.y = b;
+)
+    return val;
+}
+// Duplicates one scalar into both halves: result = {a, a}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{ mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+,
+    val.x = a;
+    val.y = a;
+)
+    return val;
+}
+// Swaps the two halves: result = {a.y, a.x}.
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+,
+    val.x = a.y;
+    val.y = a.x;
+)
+    return val;
+}
+// Reinterprets the 16 storage bits of h as a signed short (no numeric
+// conversion). The cast target type is restored from the return type.
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return static_cast<short int>(__BFLOAT16_TO_CUS(h));
+,
+    return static_cast<short int>(__nv_bfloat16_raw(h).x);
+)
+}
+// Returns the 16 storage bits of h as an unsigned short (no numeric
+// conversion).
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h)
+{
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    return __BFLOAT16_TO_CUS(h);
+,
+    return __nv_bfloat16_raw(h).x;
+)
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = static_cast(i); + return h; +, + __nv_bfloat16_raw hr; + hr.x = static_cast(i); + return __nv_bfloat16(hr); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = i; + return h; +, + __nv_bfloat16_raw hr; + hr.x = i; + return __nv_bfloat16(hr); +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __nv_bfloat16, __nv_bfloat162 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name, var, delta, c, mask) /* do */ {\ + __nv_bfloat162 r; \ + asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_SYNC_BFLOAT162_MACRO + +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2bfloat16(temp2); +} + 
+/******************************************************************************
+*           __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs       *
+*                                                                             *
+* Each loader below is one inline-PTX `ld.global.<op>` on the pointer given:  *
+* the __nv_bfloat162 overload uses the .b32 form with an "r" output, the      *
+* __nv_bfloat16 overload uses the .b16 form with an "h" output.               *
+*   __ldg -> .nc   __ldcg -> .cg   __ldca -> .ca   __ldcs -> .cs              *
+*   __ldlu -> .lu  __ldcv -> .cv                                              *
+* NOTE(review): what each cache qualifier means is defined by the PTX ISA,    *
+* not by this file.                                                           *
+* __LDG_PTR is the asm constraint used for the pointer operand: "l" when the  *
+* build is MSVC/Win64, LP64 or NVRTC, "r" otherwise. It is #undef'd after the *
+* store functions.                                                            *
+******************************************************************************/
+
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.nc.b32 */
+    __nv_bfloat162 ret;
+    asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.nc.b16 */
+    __nv_bfloat16 ret;
+    asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.cg.b32 */
+    __nv_bfloat162 ret;
+    asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.cg.b16 */
+    __nv_bfloat16 ret;
+    asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.ca.b32 */
+    __nv_bfloat162 ret;
+    asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.ca.b16 */
+    __nv_bfloat16 ret;
+    asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.cs.b32 */
+    __nv_bfloat162 ret;
+    asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.cs.b16 */
+    __nv_bfloat16 ret;
+    asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.lu.b32; unlike the four loaders above, this asm declares a "memory" clobber */
+    __nv_bfloat162 ret;
+    asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.lu.b16, with "memory" clobber */
+    __nv_bfloat16 ret;
+    asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr)
+{ /* packed pair, ld.global.cv.b32, with "memory" clobber */
+    __nv_bfloat162 ret;
+    asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr)
+{ /* scalar, ld.global.cv.b16, with "memory" clobber */
+    __nv_bfloat16 ret;
+    asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+/* Stores: each function below is one inline-PTX `st.global.<op>` writing
+ * `value` to `ptr` (.b32 for __nv_bfloat162, .b16 for __nv_bfloat16):
+ *   __stwb -> .wb   __stcg -> .cg   __stcs -> .cs   __stwt -> .wt
+ * The asm has no outputs and every store declares a "memory" clobber. */
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+
+#undef __LDG_PTR
+#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */
+/******************************************************************************
+*                           __nv_bfloat162 comparison                         *
+*                                                                             *
+* __COMPARISON_OP_BFLOAT162_MACRO(name) expands to a whole function body that *
+* compares the packed pairs `a` and `b` (names taken from the enclosing       *
+* function) and returns a __nv_bfloat162.                                     *
+*  - NV_PROVIDES_SM_90 branch: one `<name>.bf16x2.bf16x2` instruction.        *
+*  - other branch: the high halves are isolated with `and ... 0xffff0000U`,   *
+*    the low halves are moved up with `shl ... 16`, both are compared with    *
+*    `<name>.f32.f32`, then the low result is shifted right by 16 and OR-ed   *
+*    with the high result to rebuild the packed value.                        *
+* NOTE(review): presumably each half of the result is bf16 1.0 (0x3F80) or 0, *
+* as the host fallbacks of __heq2 etc. produce - confirm against the PTX      *
+* `set` instruction description.                                              *
+******************************************************************************/
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+    __nv_bfloat162 val; \
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\
+    asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+,\
+    asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+         " and.b32 high_a, %1, 0xffff0000U;\n"\
+         " and.b32 high_b, %2, 0xffff0000U;\n"\
+         " shl.b32 low_a, %1, 16;\n"\
+         " shl.b32 low_b, %2, 16;\n"\
+         " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+         " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+         " shr.u32 low_res, low_res, 16;\n"\
+         " or.b32 %0, high_res, low_res;}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+)\
+    return val; \
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ /* per-half comparison using PTX set.eq */
+    __COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{ /* per-half comparison using PTX set.ne */
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_heq2(a, b); +, + __nv_bfloat162_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hne2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hle2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hge2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hlt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hequ2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hleu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgeu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hltu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgtu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} + +/****************************************************************************** +* __nv_bfloat162 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT162_MACRO_MASK(name) {\ + unsigned val; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".u32.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO_MASK + +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + unsigned int val; \ + bool retval; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + if (val == 0x3F803F80U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) +, + return (__heq(a.x, b.x) && __heq(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) +, + return (__hne(a.x, b.x) && __hne(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool 
__hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) +, + return (__hle(a.x, b.x) && __hle(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) +, + return (__hge(a.x, b.x) && __hge(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) +, + return (__hlt(a.x, b.x) && __hlt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) +, + return (__hgt(a.x, b.x) && __hgt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) +, + return (__hequ(a.x, b.x) && __hequ(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) +, + return (__hneu(a.x, b.x) && __hneu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) +, + return (__hleu(a.x, b.x) && __hleu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) +, + return (__hgeu(a.x, b.x) && __hgeu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +, + return (__hltu(a.x, b.x) && __hltu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +, + return (__hgtu(a.x, b.x) && __hgtu(a.y, b.y)); +) +} +#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO +/****************************************************************************** +* __nv_bfloat16 comparison * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +,\ + unsigned int val; \ + asm( "{.reg .b32 a,b;\n"\ + " mov.b32 a, {0, %1};\n"\ + " mov.b32 b, {0, %2};\n"\ + " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ + :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +)\ +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(eq) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ne) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(le) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ge) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(lt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(equ) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(neu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(leu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(geu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ltu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gtu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_BFLOAT16_MACRO +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + 
__nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2(a, b); +, + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2(a, b); +, + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2(a, b); +, + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2_rn(a, b); +, + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2_rn(a, b); +, + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2_rn(a, b); +, + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); +) + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f,%1,one,%2;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mone;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mone, 0xbf80bf80U;\n" + " fma.rn.bf16x2 f,%2,mone,%1;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mzero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mzero, 0x80008000U;\n" + " fma.rn.bf16x2 f,%1,%2,mzero;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ .reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f, %1, %2, %3;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { + __nv_bfloat16 ha, hb; + + ha = __low2bfloat16(a); + hb = __low2bfloat16(b); + + const __nv_bfloat16 v1 = __hdiv(ha, hb); + + ha = __high2bfloat16(a); + hb = __high2bfloat16(b); + + const __nv_bfloat16 v2 = __hdiv(ha, hb); + + return __halves2bfloat162(v1, v2); +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * 
+******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " 
fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, 1.0f, fb)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code 
+ val = __float2bfloat16(__fmaf_ieee_rn(fb, -1.0f, fa)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, fb, -0.0f)); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa + fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa - fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa * fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hadd_rn(a, b); +, + return __hadd(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hsub_rn(a, b); +, + return __hsub(a, b); + +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hmul_rn(a, b); +, + return __hmul(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, one, %2;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hadd(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mone;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mone, 0xbf80U;\n" + " fma.rn.bf16 f, %2, mone, %1;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hsub(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mzero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mzero, 0x8000U;\n" + " fma.rn.bf16 f, %1, %2, mzero;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hmul(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ 
+ __nv_bfloat16 val; + asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, %2, %3;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ + __nv_bfloat16 val; \ + asm( "{.reg .b32 a,b,res;\n"\ + " mov.b32 a, {0,%1};\n"\ + " mov.b32 b, {0,%2};\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ + " cvt.rn.bf16.f32 %0, res;}\n"\ + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { + const float two_126 = __uint_as_float(0x7E800000U) ; //2^126 + const float a_f = __bfloat162float(a); + float b_f = __bfloat162float(b); + float ans; + bool b_big = (fabsf(b_f) >= two_126); + if(b_big){b_f *= 0.25f;} + + // f32 div approximation. Good enough for c-r bfloat div. 
+ asm("{ div.approx.f32 %0, %1, %2; }" : "=f"(ans) : "f"(a_f), "f"(b_f)); + + // Prevent ftz: + if(b_big){ans = __fmaf_ieee_rn(ans, 0.25f, -0.0f);} + return __float2bfloat16(ans); +} + +#undef __BINARY_OP_BFLOAT16_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hdiv(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa / fb); +) +} + +/****************************************************************************** +* __nv_bfloat162 functions * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + float r = sinf(f); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of sinf() + // Otherwise, ftz=on, then sinf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { + return __hsin_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = cosf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { + return __hcos_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16exp(const float x) +{ + const float log2e_up = __uint_as_float(0x3FB8AA3CU); + float fa = x * log2e_up; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return fa; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16exp(fa); + return __float2bfloat16_rn(fa); +} + +#define __APPROX_FCAST2(fun) /* do */ {\ + __nv_bfloat162 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " mov.b32 fl, {0,hl}; \n"\ + " mov.b32 fu, {0,hu}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ + " cvt.rn.bf16.f32 hl, fl; \n"\ + " cvt.rn.bf16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," 
__CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" + +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + return __floats2bfloat162_rn( __internal_device_fast_bf16exp(__low2float(a)), __internal_device_fast_bf16exp(__high2float(a)) ); +) +} + +__CUDA_BF16_DECL__ float __internal_device_tanhf_noftz(const float x) +{ + float f = x; + float r = tanhf(x); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of tanhf() + // Otherwise, ftz=on, then tanhf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return f; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a) { + float f = __bfloat162float(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f)); +, + f = __internal_device_tanhf_noftz(f); +) + __nv_bfloat16 h = __float2bfloat16_rn(f); + return h; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a) { + float2 f = __bfloat1622float2(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.x)); + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.y)); +, + f.x = __internal_device_tanhf_noftz(f.x); + f.y = __internal_device_tanhf_noftz(f.y); +) + __nv_bfloat162 h = __float22bfloat162_rn(f); + return h; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a) { + __nv_bfloat16 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16_raw hr = (__nv_bfloat16_raw)a; + asm("tanh.approx.bf16 %0, %0;" : "+h"(hr.x)); + r = (__nv_bfloat16)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a) { + __nv_bfloat162 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("tanh.approx.bf16x2 %0, %1;" : "=r"(__BFLOAT162_TO_UI(res)) : "r"(__BFLOAT162_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(ex2) +, + float fl = __low2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { + const float log10_2 = __uint_as_float(0x40549A78U); + float fa = 
__bfloat162float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + __nv_bfloat16 r = __float2bfloat16_rn(fa); + __nv_bfloat16_raw araw = static_cast<__nv_bfloat16_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + araw.x = 0x3f75U; + r = static_cast<__nv_bfloat16>(araw); + } + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + const float log10_2 = __uint_as_float(0x40549A78U); + float fl = __low2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + + float fh = __high2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + + r = __floats2bfloat162_rn( fl, fh ); + + const __nv_bfloat162_raw araw = static_cast<__nv_bfloat162_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.x = static_cast<__nv_bfloat16>(raw_fix); + } + if (araw.y == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.y = static_cast<__nv_bfloat16>(raw_fix); + } +) + return r; +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16log2(float x) +{ + asm("{ lg2.approx.f32 %0, %0; }" : "+f"(x)); + return x; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + return __float2bfloat16_rn(fa); +} + +__CUDA_BF16_DECL__ 
__nv_bfloat162 h2log2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(lg2) +, + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { + const float flt_ln2 = __uint_as_float(0x3f317218U); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_ln2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_ln2 = __uint_as_float(0x3f317218U); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_ln2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_ln2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_log10_2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " 
mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_log10_2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_log10_2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { + float fl = __low2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(rsqrt) +, + float fl = __low2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(sqrt) +, + float fl = __low2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fh)); + 
return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +#undef __APPROX_FCAST2 +#undef __BF16_SPEC_CASE2 + +__CUDA_BF16_DECL__ bool __internal_device_hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return __BFLOAT16_TO_CUS(r) != 0U; +, + unsigned int r; + asm( "{.reg .b32 a;\n" + " mov.b32 a, {0,%1};\n" + " set.nan.f32.f32 %0, a, a;}\n" + :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); + return r != 0U; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + __nv_bfloat162_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + r = __nv_bfloat162(val); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hisnan(a); +, + const __nv_bfloat16_raw hr = static_cast<__nv_bfloat16_raw>(a); + return ((hr.x & 0x7FFFU) > 0x7F80U); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{neg.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{neg.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + const float fa = __bfloat162float(a); + return __float2bfloat16(__fmaf_ieee_rn(fa, -1.0f, -0.0f)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneg(a); +, + const float fa = __bfloat162float(a); + return __float2bfloat16(-fa); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{abs.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{abs.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + __nv_bfloat16_raw abs_a_raw = static_cast<__nv_bfloat16_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned 
short)0x7F80U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__nv_bfloat16>(abs_a_raw); +) +} + +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else + { + maxval = __hge(a, b) ? a : b; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else + { + minval = __hle(a, b) ? 
a : b; + } + + return minval; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax_nan(a.x, b.x); + val.y = __hmax_nan(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin_nan(a.x, b.x); + val.y = __hmin_nan(a.y, b.y); + return val; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); + __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_bfloat162(real_tmp, img_tmp); +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat162 r; + asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" + : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__nv_bfloat162*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" + : "=h"(__BFLOAT16_TO_US(r)) + : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) + : "memory"); + return r; +, + unsigned short int* address_as_us = (unsigned short int*)address; + unsigned short int old = *address_as_us; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(address_as_us, assumed, + __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); + return __ushort_as_bfloat16(old); +) +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +#endif /* !(defined __DOXYGEN_ONLY__) */ + +#endif /* defined(__cplusplus) */ + +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_BF16_DECL__ +#undef __CUDA_BF16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#undef __CPP_VERSION_AT_LEAST_11_BF16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_HPP__ */ diff 
--git a/numba_cuda/numba/cuda/include/13/cuda_fp16.h b/numba_cuda/numba/cuda/include/13/cuda_fp16.h new file mode 100644 index 000000000..788b81452 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.h @@ -0,0 +1,5363 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. 
These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions. +* To use these functions, include the header file \p cuda_fp16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the +* use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p half which is essentially a user-defined type. 
+* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If +* defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p half and \p half2 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS Half Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these constants, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. 
+*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Macros to allow half & half2 to be used by inline assembly: each yields an lvalue that views the object as (const) unsigned short or (const) unsigned int; the C variants keep const so they accept const operands */ +#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var))) +#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var))) + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ +struct __half; +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. 
+* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __double2half \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2half \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2half(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __float2half_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. 
+* - __float2half_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-towards-zero mode. +* - __float2half_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-down mode. +* - __float2half_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rd(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-up mode. +* - __float2half_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. +* \param[in] a - half. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __half2float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __half2float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __half2float(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. 
+* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with both halves equal to the converted half +* precision number. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed char in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed char +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __half2char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __half2char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __half2char_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned char in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __half2uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __half2uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __half2short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-towards-zero mode. 
+* - __half2ushort_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __half2int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-towards-zero mode. +* - __half2uint_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. 
+* - __half2uint_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-towards-zero mode. +* - __half2ll_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rz(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-towards-zero mode. +* - __half2ull_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. 
+* - __half2ull_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rz(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Vector function, combines two \p __half numbers into one \p __half2 number. +* +* \details Combines two input \p __half numbers \p x and \p y into one \p __half2 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - half. Is only being read. +* \param[in] y - half. Is only being read. +* +* \returns __half2 +* - The \p __half2 vector with the low half equal to \p x and the high half equal to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of \p float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of \p float2 to half precision in round-to-nearest-even +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted \p float2 components. +* +* \see __float2half_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to \p float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to \p float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to \p float2. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __half2int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __half2int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __half2int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. 
+* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __half2short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. +* - __half2short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. 
+* - __half2short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-to-nearest-even mode. +* - __half2uint_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-down mode. +* - __half2uint_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-up mode. +* - __half2uint_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. 
+* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-to-nearest-even mode. +* - __half2ushort_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-down mode. +* - __half2ushort_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rd(NaN) returns 0. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-up mode. +* - __half2ushort_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_ru(NaN) returns 0. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. 
+* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-to-nearest-even mode. +* - __half2ull_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. 
+* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-down mode. +* - __half2ull_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-up mode. +* - __half2ull_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_ru(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-to-nearest-even mode. +* - __half2ll_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-down mode. +* - __half2ll_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. 
+* - __half2ll_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-up mode. +* - __half2ll_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_ru(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the largest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The truncated value. 
+* - htrunc( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - htrunc( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - htrunc(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The smallest integer value not less than \p h. +* - hceil( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hceil( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hceil(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The largest integer value which is less than or equal to \p h. +* - hfloor( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hfloor( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hfloor(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* - hrint( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrint( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrint(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the largest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* +* \see htrunc(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* +* \see hceil(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector floor of the input argument. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which are less than or equal to \p h. +* +* \see hfloor(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* +* \see hrint(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* \details Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* \param[in] a - half. Is only being read. +* +* \returns half2 +* - The vector which has both its halves equal to the input \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* \details Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - \p a with its halves swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines +* into one \p half2 number. +* +* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of +* the return value, low 16 bits from input \p b are stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from each of the two \p half2 inputs and +* combines into one \p half2 number. +* +* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of +* the return value, high 16 bits from input \p b are stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns high 16 bits of \p half2 input. +* +* \details Returns high 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns low 16 bits of \p half2 input. +* +* \details Returns low 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - The low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Checks if the input \p half number is infinite. +* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Combines two \p half numbers into one \p half2 number. +* +* \details Combines two input \p half number \p a and \p b into one \p half2 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. 
+* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half2 +* - The half2 with one half equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as an unsigned short integer. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. 
+* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +#if defined(_WIN32) +# define __CUDA_FP16_DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(_NVHPC_CUDA) +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release." 
+#else +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." +#endif + +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize); + +#undef __CUDA_FP16_WSB_DEPRECATION_MESSAGE +#undef __CUDA_FP16_DEPRECATED__ +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. 
Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */ + +#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed to by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison.
+* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison.
+* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. 
+* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. 
+* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* +* \see __habs(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the results to range [0.0, 1.0].
NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* +* \see __hneg(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The absolute value of \p a. +* - __habs \cuda_math_formula (\pm 0)\end_cuda_math_formula returns +0. +* - __habs \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - __habs(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. 
Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* - Negated input \p a. +* - __hneg \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \mp 0 \end_cuda_math_formula. +* - __hneg \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \mp \infty \end_cuda_math_formula. +* - __hneg(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison and returns boolean true +* if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector unordered if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector unordered not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector unordered less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector unordered greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector unordered less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector unordered greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half unordered greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read.
+* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half unordered if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half unordered not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half unordered less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half unordered greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half unordered less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half unordered greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through.
+* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision: (a.x + I*a.y), (b.x + I*b.y), (c.x + I*c.y) +* and performs complex multiply-accumulate operation: a*b + c in a simple way: +* ((a.x*b.x + c.x) - a.y*b.y) + I*((a.x*b.y + c.y) + a.y*b.x) +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* - __half2 result = __hcmadd(a, b, c) is numerically in agreement with: +* - result.x = __hfma(-a.y, b.y, __hfma(a.x, b.x, c.x)) +* - result.y = __hfma( a.y, b.x, __hfma(a.x, b.y, c.y)) +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input: \cuda_math_formula \sqrt{a} \end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* - hsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+* - hsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input: \cuda_math_formula \frac{1}{\sqrt{a}}\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal square root of \p a. +* - hrsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns +0. +* - hrsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hrsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input: \cuda_math_formula \frac{1}{a}\end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* - hrcp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrcp \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrcp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input: \cuda_math_formula \ln(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* - hlog \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog(1) returns +0. +* - hlog(x), x < 0 returns NaN. +* - hlog \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input: \cuda_math_formula \log_{2}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* - hlog2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog2(1) returns +0. +* - hlog2(x), x < 0 returns NaN. +* - hlog2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input: \cuda_math_formula \log_{10}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* - hlog10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog10(1) returns +0. +* - hlog10(x), x < 0 returns NaN. +* - hlog10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half natural exponential function of input: \cuda_math_formula e^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. +* - hexp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates approximate \p half hyperbolic tangent function. +* +* \details Calculates approximate \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh_approx(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector approximate hyperbolic tangent function. +* +* \details Calculates \p half2 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half hyperbolic tangent function in +* round-to-nearest-even mode. 
+* +* \details Calculates \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half binary exponential function of input: \cuda_math_formula 2^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* - hexp2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp2 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half decimal exponential function of input: \cuda_math_formula 10^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. +* - hexp10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp10 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* - hcos \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hcos \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hcos(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. 
+* +* \returns half +* - The sine of \p a. +* - hsin \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - hsin \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hsin(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* +* \see hsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal square root on vector \p a. +* +* \see hrsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The elementwise reciprocal on vector \p a. +* +* \see hrcp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise natural logarithm on vector \p a. +* +* \see hlog(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary logarithm on vector \p a. +* +* \see hlog2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal logarithm on vector \p a. +* +* \see hlog10(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise exponential function on vector \p a. +* +* \see hexp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary exponential function on vector \p a. +* +* \see hexp2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. 
+* +* \see hexp10(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* +* \see hcos(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. +* +* \see hsin(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two \p __half elements; the entire \p __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. 
This operation is natively supported by devices of compute capability 6.x and higher, +* older devices use emulation path. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /*defined(__CUDACC__) || defined(_NVHPC_CUDA)*/ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#endif + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +/* C++11 header for ::std::move. 
+ * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) +#include <utility> +#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). + */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include <cstring> +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_INLINE__ +#define __CUDA_FP16_FORCEINLINE__ +#else +#define __CUDA_FP16_INLINE__ inline +#define __CUDA_FP16_FORCEINLINE__ __forceinline__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_FP16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_FP16) +#define __CUDA_FP16_CONSTEXPR__ constexpr +#else +#define __CUDA_FP16_CONSTEXPR__ +#endif + +/** + * \ingroup 
CUDA_MATH_INTRINSIC_HALF + * \brief __half_raw data type + * \details Type allows static initialization of \p half until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half, + * and not a conversion from \p short to \p half. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p half floating-point number. + */ + unsigned short x; +} __half_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2_raw data type + * \details Type allows static initialization of \p half2 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half2, + * and not a conversion from \p short2 to \p half2. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p half part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p half part. + */ + unsigned short y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +// forward-declaration of bfloat type to be used in converting constructor +struct __nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half data type + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment, arithmetic and comparison operators, and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * The objective here is to provide IEEE754-compliant implementation + * of \p binary16 type and arithmetic with limitations due to + * device HW not supporting floating-point exceptions. + */ +struct __CUDA_ALIGN__(2) __half { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Empty default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /* Convert to/from __half_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half(const __half_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p volatile \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const volatile; +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p __nv_bfloat16 input using default round-to-nearest-even rounding mode. + * Need to include the header file \p cuda_bf16.h + */ + explicit __CUDA_HOSTDEVICE__ __half(const __nv_bfloat16 f); //forward declaration only, implemented in cuda_bf16.hpp +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p float operator. 
+ */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2half_rn(static_cast<long long>(val)).__x; + } else { + __x = __int2half_rn(static_cast<int>(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2half_rn(static_cast<unsigned long long>(val)).__x; + } else { + __x = __uint2half_rn(static_cast<unsigned int>(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2char_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uchar_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in __half2char_rz(__half) and __half2uchar_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2short_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ushort_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2int_rz(__half) for further details. 
+ */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uint_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p long type and proceeds accordingly, see + * further details in __half2int_rz(__half) and __half2ll_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p unsigned \p long type and proceeds + * accordingly, see further details in __half2uint_rz(__half) and __half2ull_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ll_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ull_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ + +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half subtraction operation. 
+ * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with subtraction operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh); +/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix increment operation. 
+ * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix increment operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored); + +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary minus operator. + * \see __hneg(__half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h); +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered compare equal operation. + * \see __heq(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half unordered compare not-equal operation. + * \see __hneu(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-than compare operation. 
+ * \see __hgt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-than compare operation. + * \see __hlt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-or-equal compare operation. + * \see __hge(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-or-equal compare operation. + * \see __hle(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh); +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2 data type + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __half2 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __half2 { + /** + * Storage field holding lower \p __half part. + */ + __half x; + /** + * Storage field holding upper \p __half part. + */ + __half y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Empty default constructor, result is uninitialized.
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src); +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from two \p __half variables + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src); + + /* Convert to/from __half2_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p __half2_raw + */ + __CUDA_HOSTDEVICE__ operator __half2_raw() const; +}; + +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +/** + * \ingroup 
CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half division operation. + * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with division operation. 
+ * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary minus operator. + * \see __hneg2(__half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered compare equal operation. + * \see __hbeq2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half unordered compare not-equal operation. 
+ * \see __hbneu2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-than compare operation. + * \see __hbgt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-than compare operation. + * \see __hblt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-or-equal compare operation. + * \see __hbge2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-or-equal compare operation. + * \see __hble2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh); + +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) + +/* Note the .hpp file is included to capture the "half" & "half2" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. 
+*/ +#include "cuda_fp16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the half-precision numbers format. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __half half; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of half-precision numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. 
+ */ +typedef __half2 half2; +// for consistency with __nv_bfloat16 + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half __nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2 __nv_half2; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half_raw __nv_half_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2_raw __nv_half2_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_FP16_INLINE__ +#undef __CUDA_FP16_FORCEINLINE__ +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp new file mode 100644 index 000000000..4259992df --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp @@ -0,0 +1,3483 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. 
+* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. 
Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +#if !defined(__CUDA_FP16_H__) +#error "Do not include this file directly. Instead, include cuda_fp16.h." +#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* Macros for half & half2 binary arithmetic */ +#define __BINARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} /* while(0) */ + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines 
floating-point positive infinity value for the \p half data type + */ +#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines canonical NaN value for the \p half data type + */ +#define CUDART_NAN_FP16 __ushort_as_half((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p half data type + */ +#define CUDART_MIN_DENORM_FP16 __ushort_as_half((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a maximum representable value for the \p half data type + */ +#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a negative zero value for the \p half data type + */ +#define CUDART_NEG_ZERO_FP16 __ushort_as_half((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a positive zero value for the \p half data type + */ +#define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a value of 1.0 for the \p half data type + */ +#define CUDART_ONE_FP16 __ushort_as_half((unsigned short)0x3C00U) + +#if !(defined __DOXYGEN_ONLY__) + +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const __half_raw &hr) { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const volatile { __half_raw 
ret; ret.x = __x; return ret; } +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator float() const { return __half2float(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; } +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator signed char() const { return __half2char_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned char() const { return __half2uchar_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast<char>(__half2char_rz(*this)); + } + else + { + value = static_cast<char>(__half2uchar_rz(*this)); + } + return value; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator short() const { return __half2short_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned short() const { return __half2ushort_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator int() const { return __half2int_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned int() const { return __half2uint_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast<long>(__half2ll_rz(*this)); + } + else + { + retval = static_cast<long>(__half2int_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast<unsigned long>(__half2ull_rz(*this)); + } + else + { + retval = static_cast<unsigned long>(__half2uint_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long long() const { return __half2ll_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long long() const { return __half2ull_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x =
__ll2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast<void>(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y =
src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2::operator __half2_raw() const { + __half2_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); +, + ret.x = static_cast<__half_raw>(this->x).x; + ret.y = static_cast<__half_raw>(this->y).x; +) + return ret; +} +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ 
__half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( 
pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + unsigned int u; + unsigned int result; +#if defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)::std::memcpy(&x, &f, sizeof(f)); +#endif + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU); + } else if (u > 0x477fefffU) { // Overflows + remainder = 0x80000000U; + result = (sign | 0x7bffU); + } else if (u >= 0x38800000U) { // Normal numbers + remainder = u << 19U; + u -= 0x38000000U; + result = (sign | (u >> 13U)); + } else if (u < 0x33000001U) { // +0/-0 + remainder = u; + result = sign; + } else { // Denormal numbers + const unsigned int exponent = u >> 23U; + const unsigned int shift = 0x7eU - exponent; + unsigned int mantissa = (u & 0x7fffffU); + mantissa |= 0x800000U; + remainder = mantissa << (32U - shift); + result = (sign | (mantissa >> shift)); + result &= 0x0000FFFFU; + } + return static_cast(result); +} +#endif /* #if !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a) +{ +IF_DEVICE_OR_CUDACC( + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a)); + return val; +, + __half result; + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. 
+ // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + unsigned long long int absa; + unsigned long long int ua; + (void)memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + result = __float2half(static_cast(a)); + } + else + { + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { // Here if |a| >= 2^(-14) + // add 42 to exponent bits + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + unsigned long long int aShiftRoundBits; + (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = 
__float2half(static_cast(aShiftRound - shifter)); + } + + return result; +, + __half result; + /* + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. + // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + */ + unsigned long long int absa; + unsigned long long int ua; + (void)::std::memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + /* + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + */ + result = __float2half(static_cast(a)); + } + else + { + /* + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. 
+ */ + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { + /* + // Here if |a| >= 2^(-14) + // add 42 to exponent bits + */ + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { + /* + // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + */ + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)::std::memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + /* + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + */ + unsigned long long int aShiftRoundBits; + (void)::std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)::std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = __float2half(static_cast(aShiftRound - shifter)); + } + + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) 
&& ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low;\n" + " cvt.rn.f16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a)); +, + val = __half2(__float2half_rn(a), __float2half_rn(a)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) { + __half2 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n" + : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +, + asm("{.reg .f16 low,high;\n" + " cvt.rn.f16.f32 low, %1;\n" + " cvt.rn.f16.f32 high, %2;\n" + " mov.b32 %0, {low,high};}\n" : 
"=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +) + return val; +} + +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_float2_to_half2_rn(a,b); +, + val = __half2(__float2half_rn(a), __float2half_rn(b)); +) + return val; +} + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline float __internal_half2float(const unsigned short h) +{ + unsigned int sign = ((static_cast(h) >> 15U) & 1U); + unsigned int exponent = ((static_cast(h) >> 10U) & 0x1fU); + unsigned int mantissa = ((static_cast(h) & 0x3ffU) << 13U); + float f; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)::std::memcpy(&f, &u, sizeof(u)); +#endif + return f; +} +#endif /* !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a))); +, + val = __internal_half2float(static_cast<__half_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + 
val = __internal_half2float(static_cast<__half2_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + val = __internal_half2float(static_cast<__half2_raw>(a).y); +) + return val; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h) +{ + signed char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.s8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + const unsigned char u = static_cast(tmp); + i = static_cast(u); +, + const float f = __half2float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.u8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + i = static_cast(tmp); +, + const float f = __half2float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = 
static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h) +{ + short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h) +{ + unsigned short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h) +{ + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else 
if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h) +{ + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = min_val; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + 
const float f = __half2float(h); + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a) +{ + const __half2 val = __floats2half2_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a) +{ + float hi_float; + float lo_float; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a))); +, + lo_float = __internal_half2float(((__half2_raw)a).x); + hi_float = __internal_half2float(((__half2_raw)a).y); +) + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ int __half2int_rn(const __half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(const __half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(const __half h) +{ + int i; + asm("cvt.rpi.s32.f16 
%0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const 
short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u16 %0, %1;" : 
"=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 
{low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.x; + val.y = b.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.y; + val.y = b.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.x; +) + return ret; +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + const __half_raw araw = __half_raw(a); + if (araw.x == 0xFC00U) { + retval = -1; + } else if (araw.x == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.x; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.y; +) + return ret; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); +, + val.x = a; + val.y = b; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); +, + val.x = a; + val.y = a; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : 
"r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return static_cast(__HALF_TO_CUS(h)); +, + return static_cast(__half_raw(h).x); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __HALF_TO_CUS(h); +, + return __half_raw(h).x; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = static_cast(i); + return h; +, + __half_raw hr; + hr.x = static_cast(i); + return __half(hr); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = i; + return h; +, + __half_raw hr; + hr.x = i; + return __half(hr);) +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __internal_device_hmax(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +__CUDA_FP16_DECL__ __half __internal_device_hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, 
const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmax(a, b); +, + __half maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmin(a, b); +, + __half minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} + + +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max) +, + __half2 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min) +, + __half2 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name, var, delta, c, mask) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ 
__half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = 
__shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} 
+__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void 
__stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.eq) +, + __half2_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ne) +, + __half2_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.le) +, + __half2_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ge) +, + __half2_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.lt) +, + __half2_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gt) +, + __half2_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.equ) +, + __half2_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.neu) +, + __half2_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.leu) +, + __half2_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.geu) +, + __half2_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ltu) +, + __half2_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gtu) +, + __half2_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +#undef __COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half2 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* do */ {\ + unsigned val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_HALF2_MACRO_MASK + +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + const unsigned mask = __heq2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hne2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hle2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hge2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hlt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + const unsigned 
mask = __hgt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hequ2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hneu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hleu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgeu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hltu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgtu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(eq) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ne) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(le) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ge) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(lt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(equ) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + 
__COMPARISON_OP_HALF_MACRO(neu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(leu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(geu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ltu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gtu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add) +, + __half2 val; + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub) +, + __half2 val; + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul) +, + __half2 val; + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.sat) +, + __half2 val; + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.sat) +, + __half2 val; + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.sat) +, + __half2 val; + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.rn) +, + __half2 val; + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.rn) +, + __half2 val; + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.rn) +, + __half2 val; + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); + return val; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const 
__half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.sat) +, + return __hmin(__hmax(__hadd(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.sat) +, + return 
__hmin(__hmax(__hsub(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.sat) +, + return __hmin(__hmax(__hmul(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + abs = __habs(v); + if 
(__hlt(abs, den) && __hlt(__float2half(0.0f), abs)) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa / fb); +) +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#if defined(_NVHPC_CUDA) || defined(__CUDACC__) +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," 
__CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float __float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 
i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 
f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ + +__CUDA_FP16_DECL__ __half htanh(const __half a) { + float f = __half2float(a); + f = tanhf(f); + __half h = __float2half_rn(f); + return h; +} +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a) { + float2 f = __half22float2(a); + f.x = tanhf(f.x); + f.y = tanhf(f.y); + __half2 h = __float22half2_rn(f); + return h; +} + +__CUDA_FP16_DECL__ __half htanh_approx(const __half a) { + __half r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + __half_raw hr = (__half_raw)a; + asm("tanh.approx.f16 %0, %0;" : "+h"(hr.x)); + r = (__half)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a) { + __half2 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("tanh.approx.f16x2 %0, %1;" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + 
+__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " 
ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ 
__half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + 
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +#endif /* defined(_NVHPC_CUDA) || defined(__CUDACC__) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); +, + __half2_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + r = __half2(val); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +, + const __half_raw hr = static_cast<__half_raw>(a); + return ((hr.x & (unsigned short)0x7FFFU) > (unsigned short)0x7C00U); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + const float fa = __half2float(a); + return __float2half(-fa); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + __half_raw abs_a_raw = static_cast<__half_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned short)0x7C00U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__half>(abs_a_raw); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // 
(a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max.NaN) +, + __half maxval; + if (__hisnan(a) || __hisnan(b)) + { + maxval = CUDART_NAN_FP16; + } + else + { + maxval = __hmax(a, b); + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min.NaN) +, + __half minval; + if (__hisnan(a) || __hisnan(b)) + { + minval = CUDART_NAN_FP16; + } + else + { + minval = __hmin(a, b); + } + return minval; +) +} + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +, + return __hmax_nan(__hfma(a, b, c), CUDART_ZERO_FP16); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max.NaN) +, + __half2 result = __hmax2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min.NaN) +, + __half2 result = __hmin2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +, + __half2_raw hzero; + hzero.x = (unsigned short)0U; + hzero.y = (unsigned short)0U; + return __hmax2_nan(__hfma2(a, b, c), __half2(hzero)); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_60, + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __half2 new_val = __hadd2(val, *(__half2*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__half2*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + 
__half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +#endif /* !(defined __DOXYGEN_ONLY__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI +#undef __CUDA_FP16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index a98af1f6d..4b53bee8b 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -6,9 +6,10 @@ from llvmlite import ir from numba.core.datamodel.registry import DataModelManager, register +from numba.core.datamodel import PrimitiveModel from numba.core.extending import models from numba.core import types -from numba.cuda.types import Dim3, GridGroup, CUDADispatcher +from numba.cuda.types import Dim3, GridGroup, CUDADispatcher, Bfloat16 cuda_data_manager = DataModelManager() @@ -45,3 +46,10 @@ def __init__(self, dmm, fe_type): register_model(CUDADispatcher)(models.OpaqueModel) + + +@register_model(Bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(16) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) diff --git a/numba_cuda/numba/cuda/printimpl.py 
b/numba_cuda/numba/cuda/printimpl.py index 558335191..a0e236909 100644 --- a/numba_cuda/numba/cuda/printimpl.py +++ b/numba_cuda/numba/cuda/printimpl.py @@ -8,7 +8,7 @@ from numba.core.errors import NumbaWarning from numba.core.imputils import Registry from numba.cuda import nvvmutils -from numba.cuda.types import Dim3 +from numba.cuda.types import Dim3, Bfloat16 from warnings import warn registry = Registry() @@ -51,6 +51,17 @@ def real_print_impl(ty, context, builder, val): return "%f", [lld] +@print_item.register(Bfloat16) +def bfloat16_print_impl(ty, context, builder, val): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(val, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + # printf("%f") expects a double; promote to f64 to match vararg expectation + f64 = builder.fpext(f32, ir.DoubleType()) + return "%f", [f64] + + @print_item.register(types.StringLiteral) def const_print_impl(ty, context, builder, sigval): pyval = ty.literal_value diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index af6988dca..1ee2c5be6 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -33,7 +33,14 @@ class CUDATypingContext(typing.BaseContext): def load_additional_registries(self): - from . import cudadecl, cudamath, fp16, libdevicedecl, vector_types + from . 
import ( + cudadecl, + cudamath, + fp16, + bf16, + libdevicedecl, + vector_types, + ) from numba.core.typing import enumdecl, cffi_utils self.install_registry(cudadecl.registry) @@ -44,6 +51,7 @@ def load_additional_registries(self): self.install_registry(enumdecl.registry) self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) + self.install_registry(bf16.typing_registry) def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -156,6 +164,7 @@ def load_additional_registries(self): libdeviceimpl, mathimpl, vector_types, + bf16, ) # fix for #8940 @@ -169,6 +178,7 @@ def load_additional_registries(self): self.install_registry(mathimpl.registry) self.install_registry(vector_types.impl_registry) self.install_registry(fp16.target_registry) + self.install_registry(bf16.target_registry) def codegen(self): return self._internal_codegen diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index af392eb39..95e5fe140 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,8 +1,118 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: BSD-2-Clause -from numba import cuda, float32 -from numba.cuda.bf16 import bfloat16 +import numpy as np +from ml_dtypes import bfloat16 as mldtypes_bf16 + +from numba import ( + cuda, + float32, + float64, + int16, + int32, + int64, + uint16, + uint32, + uint64, + config, +) + + +if not config.ENABLE_CUDASIM: + from numba.cuda.bf16 import ( + bfloat16, + habs, + hadd, + hsub, + hmul, + hadd_rn, + hsub_rn, + hmul_rn, + hdiv, + hadd_sat, + hsub_sat, + hmul_sat, + hfma, + hfma_sat, + hneg, + hfma_relu, + # Comparison intrinsics + heq, + hne, + hge, + hgt, + hle, + hlt, + hmax, + hmin, + hmax_nan, + hmin_nan, + hisnan, + hisinf, + # Conversion intrinsics (NumPy-style names) + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, + int16_to_bfloat16_rn, + int16_to_bfloat16_rz, + int16_to_bfloat16_rd, + int16_to_bfloat16_ru, + bfloat16_to_int16_rn, + bfloat16_to_int16_rz, + bfloat16_to_int16_rd, + bfloat16_to_int16_ru, + uint16_to_bfloat16_rn, + uint16_to_bfloat16_rz, + uint16_to_bfloat16_rd, + uint16_to_bfloat16_ru, + bfloat16_to_uint16_rn, + bfloat16_to_uint16_rz, + bfloat16_to_uint16_rd, + bfloat16_to_uint16_ru, + int32_to_bfloat16_rn, + int32_to_bfloat16_rz, + int32_to_bfloat16_rd, + int32_to_bfloat16_ru, + bfloat16_to_int32_rn, + bfloat16_to_int32_rz, + bfloat16_to_int32_rd, + bfloat16_to_int32_ru, + uint32_to_bfloat16_rn, + uint32_to_bfloat16_rz, + uint32_to_bfloat16_rd, + uint32_to_bfloat16_ru, + bfloat16_to_uint32_rn, + bfloat16_to_uint32_rz, + bfloat16_to_uint32_rd, + bfloat16_to_uint32_ru, + bfloat16_to_int64_rn, + bfloat16_to_int64_rz, + bfloat16_to_int64_rd, + bfloat16_to_int64_ru, + int64_to_bfloat16_rn, + int64_to_bfloat16_rz, + int64_to_bfloat16_rd, + int64_to_bfloat16_ru, + bfloat16_to_uint64_rn, + bfloat16_to_uint64_rz, + bfloat16_to_uint64_rd, + bfloat16_to_uint64_ru, + uint64_to_bfloat16_rn, + uint64_to_bfloat16_rz, + uint64_to_bfloat16_rd, + uint64_to_bfloat16_ru, + bfloat16_as_int16, + int16_as_bfloat16, + bfloat16_as_uint16, + 
uint16_as_bfloat16, + bfloat16_to_float32, + float32_to_bfloat16, + float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, + ) + from numba.cuda.testing import CUDATestCase import math @@ -61,3 +171,431 @@ def kernel(arr): self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1) else: self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2) + + def test_arithmetic_intrinsics_basic(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.25) + b = bfloat16(-2.5) + + out[0] = float32(habs(b)) + out[1] = float32(hadd(a, b)) + out[2] = float32(hsub(a, b)) + out[3] = float32(hmul(a, b)) + out[4] = float32(hdiv(b, a)) + out[5] = float32(hneg(a)) + out[6] = float32(hfma(a, b, b)) + + out[7] = float32(hadd_rn(a, b)) + out[8] = float32(hsub_rn(a, b)) + out[9] = float32(hmul_rn(a, b)) + + out = cuda.device_array((10,), dtype="float32") + kernel[1, 1](out) + + a = 1.25 + b = -2.5 + expected = [ + abs(b), + a + b, + a - b, + a * b, + b / a, + -a, + a * b + b, + a + b, + a - b, + a * b, + ] + for i, exp in enumerate(expected): + self.assertAlmostEqual(out[i], exp, delta=1e-2) + + def test_arithmetic_intrinsics_saturating(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + b = bfloat16(0.75) + + out[0] = float32(hadd_sat(a, b)) # 2.25 -> 1.0 + out[1] = float32(hsub_sat(b, a)) # -0.75 -> 0.0 + out[2] = float32(hmul_sat(a, b)) # 1.125 -> 1.0 + out[3] = float32(hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.0, delta=1e-3) + self.assertAlmostEqual(out[1], 0.0, delta=1e-3) + self.assertAlmostEqual(out[2], 1.0, delta=1e-3) + self.assertAlmostEqual(out[3], 1.0, delta=1e-3) + + # Also check they are clamped within [0, 1] + for i in range(4): + self.assertGreaterEqual(out[i], 0.0) + self.assertLessEqual(out[i], 1.0) + + def test_fma_relu_intrinsic(self): + 
self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(-1.5) + b = bfloat16(2.0) + c = bfloat16(0.0) + + out[0] = float32(hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 0.0, delta=1e-3) + + def test_comparison_intrinsics(self): + self.skip_unsupported() + + def make_kernel(cmpfn): + @cuda.jit + def kernel(out, a, b): + a_bf16 = bfloat16(a) + b_bf16 = bfloat16(b) + out[0] = cmpfn(a_bf16, b_bf16) + + return kernel + + comparisons = [heq, hne, hge, hgt, hle, hlt] + ops = [ + lambda x, y: x == y, + lambda x, y: x != y, + lambda x, y: x >= y, + lambda x, y: x > y, + lambda x, y: x <= y, + lambda x, y: x < y, + ] + + for cmpfn, op in zip(comparisons, ops): + with self.subTest(cmpfn=cmpfn): + kernel = make_kernel(cmpfn) + out = cuda.device_array((1,), dtype="bool") + + a = 3.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 3.0)) + + a = 3.0 + b = 4.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 4.0)) + + a = 4.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(4.0, 3.0)) + + def test_hmax_hmin_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(3.0) + b = bfloat16(4.0) + out[0] = float32(hmax(a, b)) + out[1] = float32(hmin(a, b)) + + out = cuda.device_array((2,), dtype="float32") + kernel[1, 1](out) + self.assertAlmostEqual(out[0], 4.0, delta=1e-3) + self.assertAlmostEqual(out[1], 3.0, delta=1e-3) + + def test_nan_and_inf_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out_bool, out_int): + nanv = bfloat16(float("nan")) + infv = bfloat16(float("inf")) + out_bool[0] = hisnan(nanv) + out_int[0] = hisinf(infv) + + out_bool = cuda.device_array((1,), dtype="bool") + out_int = cuda.device_array((1,), dtype="int32") + kernel[1, 1](out_bool, out_int) + self.assertTrue(bool(out_bool[0])) + self.assertNotEqual(int(out_int[0]), 0) 
+ + def test_hmax_nan_hmin_nan_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(float("nan")) + b = bfloat16(2.0) + out[0] = float32(hmax_nan(a, b)) + out[1] = float32(hmin_nan(a, b)) + out[2] = float32(hmax(a, b)) + out[3] = float32(hmin(a, b)) + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + # NaN-propagating variants should produce NaN + self.assertTrue(math.isnan(out[0])) + self.assertTrue(math.isnan(out[1])) + # Non-NaN variants should return the non-NaN operand + self.assertAlmostEqual(out[2], 2.0, delta=1e-3) + self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + + def test_bfloat16_as_bitcast(self): + self.skip_unsupported() + + @cuda.jit + def roundtrip_kernel(test_val, i2, u2): + i2[0] = bfloat16_as_int16(int16_as_bfloat16(test_val)) + u2[0] = bfloat16_as_uint16(uint16_as_bfloat16(test_val)) + + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 + i2 = cuda.device_array((1,), dtype="int16") + u2 = cuda.device_array((1,), dtype="uint16") + roundtrip_kernel[1, 1](test_val, i2, u2) + + self.assertEqual(i2[0], test_val) + self.assertEqual(u2[0], test_val) + + def test_to_integer_conversions(self): + self.skip_unsupported() + + @cuda.jit + def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): + a = int16_as_bfloat16(test_val) + + i1[0] = bfloat16_to_int8_rz(a) + u1[0] = bfloat16_to_uint8_rz(a) + i2[0] = bfloat16_to_int16_rn(a) + i2[1] = bfloat16_to_int16_rz(a) + i2[2] = bfloat16_to_int16_rd(a) + i2[3] = bfloat16_to_int16_ru(a) + u2[0] = bfloat16_to_uint16_rn(a) + u2[1] = bfloat16_to_uint16_rz(a) + u2[2] = bfloat16_to_uint16_rd(a) + u2[3] = bfloat16_to_uint16_ru(a) + i3[0] = bfloat16_to_int32_rn(a) + i3[1] = bfloat16_to_int32_rz(a) + i3[2] = bfloat16_to_int32_rd(a) + i3[3] = bfloat16_to_int32_ru(a) + u3[0] = bfloat16_to_uint32_rn(a) + u3[1] = bfloat16_to_uint32_rz(a) + u3[2] = bfloat16_to_uint32_rd(a) + u3[3] = bfloat16_to_uint32_ru(a) + i4[0] = bfloat16_to_int64_rn(a) + i4[1] = bfloat16_to_int64_rz(a)
+ i4[2] = bfloat16_to_int64_rd(a) + i4[3] = bfloat16_to_int64_ru(a) + u4[0] = bfloat16_to_uint64_rn(a) + u4[1] = bfloat16_to_uint64_rz(a) + u4[2] = bfloat16_to_uint64_rd(a) + u4[3] = bfloat16_to_uint64_ru(a) + + # rz + i1 = cuda.device_array((1,), dtype="int8") + # rn, rz, rd, ru + i2 = cuda.device_array((4,), dtype="int16") + i3 = cuda.device_array((4,), dtype="int32") + i4 = cuda.device_array((4,), dtype="int64") + # rz + u1 = cuda.device_array((1,), dtype="uint8") + # rn, rz, rd, ru + u2 = cuda.device_array((4,), dtype="uint16") + u3 = cuda.device_array((4,), dtype="uint32") + u4 = cuda.device_array((4,), dtype="uint64") + + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 + + kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4) + + self.assertEqual(i1[0], 1) + self.assertEqual(u1[0], 1) + + np.testing.assert_equal(i2, np.array([2, 1, 1, 2], "int16")) + np.testing.assert_equal(i3, np.array([2, 1, 1, 2], "int32")) + np.testing.assert_equal(i4, np.array([2, 1, 1, 2], "int64")) + np.testing.assert_equal(u2, np.array([2, 1, 1, 2], "uint16")) + np.testing.assert_equal(u3, np.array([2, 1, 1, 2], "uint32")) + np.testing.assert_equal(u4, np.array([2, 1, 1, 2], "uint64")) + + def test_from_integer_conversions(self): + self.skip_unsupported() + + test_val = 789 + + @cuda.jit + def kernel(out): + i2 = int16(test_val) + i3 = int32(test_val) + i4 = int64(test_val) + u2 = uint16(test_val) + u3 = uint32(test_val) + u4 = uint64(test_val) + + i2rn = int16_to_bfloat16_rn(i2) + i2rz = int16_to_bfloat16_rz(i2) + i2rd = int16_to_bfloat16_rd(i2) + i2ru = int16_to_bfloat16_ru(i2) + + u2rn = uint16_to_bfloat16_rn(u2) + u2rz = uint16_to_bfloat16_rz(u2) + u2rd = uint16_to_bfloat16_rd(u2) + u2ru = uint16_to_bfloat16_ru(u2) + + i3rn = int32_to_bfloat16_rn(i3) + i3rz = int32_to_bfloat16_rz(i3) + i3rd = int32_to_bfloat16_rd(i3) + i3ru = int32_to_bfloat16_ru(i3) + + u3rn = uint32_to_bfloat16_rn(u3) + u3rz = uint32_to_bfloat16_rz(u3) + u3rd = uint32_to_bfloat16_rd(u3) + u3ru = 
uint32_to_bfloat16_ru(u3) + + i4rn = int64_to_bfloat16_rn(i4) + i4rz = int64_to_bfloat16_rz(i4) + i4rd = int64_to_bfloat16_rd(i4) + i4ru = int64_to_bfloat16_ru(i4) + + u4rn = uint64_to_bfloat16_rn(u4) + u4rz = uint64_to_bfloat16_rz(u4) + u4rd = uint64_to_bfloat16_rd(u4) + u4ru = uint64_to_bfloat16_ru(u4) + + out[0] = bfloat16_as_int16(i2rn) + out[1] = bfloat16_as_int16(i2rz) + out[2] = bfloat16_as_int16(i2rd) + out[3] = bfloat16_as_int16(i2ru) + out[4] = bfloat16_as_int16(u2rn) + out[5] = bfloat16_as_int16(u2rz) + out[6] = bfloat16_as_int16(u2rd) + out[7] = bfloat16_as_int16(u2ru) + out[8] = bfloat16_as_int16(i3rn) + out[9] = bfloat16_as_int16(i3rz) + out[10] = bfloat16_as_int16(i3rd) + out[11] = bfloat16_as_int16(i3ru) + out[12] = bfloat16_as_int16(u3rn) + out[13] = bfloat16_as_int16(u3rz) + out[14] = bfloat16_as_int16(u3rd) + out[15] = bfloat16_as_int16(u3ru) + out[16] = bfloat16_as_int16(i4rn) + out[17] = bfloat16_as_int16(i4rz) + out[18] = bfloat16_as_int16(i4rd) + out[19] = bfloat16_as_int16(i4ru) + out[20] = bfloat16_as_int16(u4rn) + out[21] = bfloat16_as_int16(u4rz) + out[22] = bfloat16_as_int16(u4rd) + out[23] = bfloat16_as_int16(u4ru) + + out = cuda.device_array((24,), dtype="int16") + kernel[1, 1](out) + res = out.copy_to_host() + + i2 = np.int16(789).astype(mldtypes_bf16).view("int16") + i3 = np.int32(789).astype(mldtypes_bf16).view("int16") + i4 = np.int64(789).astype(mldtypes_bf16).view("int16") + u2 = np.uint16(789).astype(mldtypes_bf16).view("int16") + u3 = np.uint32(789).astype(mldtypes_bf16).view("int16") + u4 = np.uint64(789).astype(mldtypes_bf16).view("int16") + + i2arr = np.array([i2] * 4) + i3arr = np.array([i3] * 4) + i4arr = np.array([i4] * 4) + u2arr = np.array([u2] * 4) + u3arr = np.array([u3] * 4) + u4arr = np.array([u4] * 4) + + two = np.ones_like(res[0:4]) * 2 + np.testing.assert_array_less(_bf16_ulp_distance(res[0:4], i2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[4:8], i3arr), two) + 
np.testing.assert_array_less(_bf16_ulp_distance(res[8:12], i4arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[12:16], u2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], u3arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two) + + def test_to_float_conversions(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + out[0] = bfloat16_to_float32(a) + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.5, delta=1e-7) # conversion is exact + + def test_from_float_conversions(self): + self.skip_unsupported() + + test_val = 1.5 + + @cuda.jit + def kernel(out): + f4 = float32(test_val) + f8 = float64(test_val) + + f4rn = float32_to_bfloat16_rn(f4) + f4rz = float32_to_bfloat16_rz(f4) + f4rd = float32_to_bfloat16_rd(f4) + f4ru = float32_to_bfloat16_ru(f4) + + f4_default = float32_to_bfloat16(f4) + f8_default = float64_to_bfloat16(f8) + + out[0] = bfloat16_as_int16(f4rn) + out[1] = bfloat16_as_int16(f4rz) + out[2] = bfloat16_as_int16(f4rd) + out[3] = bfloat16_as_int16(f4ru) + out[4] = bfloat16_as_int16(f4_default) + out[5] = bfloat16_as_int16(f8_default) + + out = cuda.device_array((6,), dtype="int16") # kernel writes out[0..5] + kernel[1, 1](out) + raw = out.copy_to_host() + + f4_expected = ( + np.array([test_val] * 4, "float32") + .astype(mldtypes_bf16) + .view("int16") + ) + f8_expected = ( + np.array([test_val] * 1, "float64") + .astype(mldtypes_bf16) + .view("int16") + ) + + np.testing.assert_array_less( + _bf16_ulp_distance(raw[0:4], f4_expected), 2 + ) + np.testing.assert_array_less( + _bf16_ulp_distance(raw[4:], f8_expected), 2 + ) + + +def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray: + """ + Compute the ULP rank of a bfloat16 value. Input is the bits of the bfloat16 value as an int16. + The ULP rank is the number of ULPs between the value and 0.
+ Negative values have the sign bit masked off and are mirrored below 0x8000, so +0 and -0 share a rank. + """ + u = bits_int16.view(np.uint16).astype(np.int32) + sign = u >> 15 + return np.where(sign == 0, u + 0x8000, 0x8000 - (u & 0x7FFF)) + + +def _bf16_ulp_distance( + a_bits_int16: np.ndarray, b_bits_int16: np.ndarray +) -> np.ndarray: + """ + Compute the difference between two bfloat16 values in ULPs. + """ + return np.abs(_bf16_ulp_rank(a_bits_int16) - _bf16_ulp_rank(b_bits_int16)) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py index a10949de9..7d4343e35 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py @@ -4,6 +4,8 @@ import numba.cuda as cuda from numba.cuda.testing import unittest, CUDATestCase import numpy as np +import operator +from numba.cuda.testing import skip_if_nvjitlink_missing from numba import ( config, @@ -292,6 +294,37 @@ def kernel(arr): np.testing.assert_allclose(arr, [3], atol=1e-2) + @skip_if_nvjitlink_missing("LTO is not supported without nvjitlink.") + def test_bf16_intrinsics_used_in_lto(self): + self.skip_unsupported() + + operations = [ + (operator.add, "fma.rn.bf16"), + (operator.sub, "fma.rn.bf16"), + (operator.mul, "fma.rn.bf16"), + ( + operator.truediv, + "div.approx.f32", + ), # no native bf16 div, see cuda_bf16.hpp:L3067 + ] + + for op, ptx_op in operations: + with self.subTest(op=op): + + @cuda.jit(lto=True) + def kernel(arr): + a = nv_bfloat16(3.14) + b = nv_bfloat16(5) + arr[0] = float32(op(a, b)) + + arr = np.zeros(1, np.float32) + kernel[1, 1](arr) + np.testing.assert_allclose(arr, [op(3.14, 5)], atol=1e-1) + + ptx = next(iter(kernel.inspect_lto_ptx().values())) + + assert ptx_op in ptx, ptx + if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index ca7a5ff13..ff27fd169
100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -102,6 +102,20 @@ def print_too_many(r): cuda.synchronize() """ +print_bfloat16_usecase = """\ +from numba import cuda, config + +@cuda.jit +def print_bfloat16(): + # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits. + # printing this should not give any rounding error. + a = cuda.types.bfloat16(0.9375) + print(a, a, a) + +print_bfloat16[1, 1]() +cuda.synchronize() +""" + class TestPrint(CUDATestCase): # Note that in these tests we generally strip the output to avoid dealing @@ -148,6 +162,11 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) + @skip_on_cudasim("bfloat16 on host is not yet supported.") + def test_bfloat16(self): + output, _ = self.run_code(print_bfloat16_usecase) + self.assertEqual(output.strip(), "0.937500 0.937500 0.937500") + @skip_on_cudasim("cudasim can print unlimited output") def test_too_many_args(self): # Tests that we emit the format string and warn when there are more diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 437e0d2f2..d1ec8c28d 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: BSD-2-Clause from numba.core import types +from numba.core.typeconv import Conversion class Dim3(types.Type): @@ -41,3 +42,58 @@ class CUDADispatcher(types.Dispatcher): # is still probably a good idea to have a separate type for CUDA # dispatchers, and this type might get other differentiation from the CPU # dispatcher type in future. + + +class Bfloat16(types.Number): + """ + A bfloat16 type. Has 8 exponent bits and 7 significand bits. 
+ + Conversion rules: + Floats: + from: + fp32, fp64: UNSAFE + fp16: UNSAFE (loses precision) + to: + fp32, fp64: SAFE (same exponent, more mantissa) + fp16: UNSAFE (loses range) + + Integers: + from: + int8: SAFE + other int: All UNSAFE (bf16 cannot represent all integers in range) + to: UNSAFE (loses precision, round to zeros) + + All other conversions are not allowed. + """ + + def __init__(self): + super().__init__(name="__nv_bfloat16") + + self.alignof_ = 2 + self.bitwidth = 16 + + def can_convert_from(self, typingctx, other): + if isinstance(other, types.Float): + return Conversion.unsafe + + elif isinstance(other, types.Integer): + if other.bitwidth == 8: + return Conversion.safe + else: + return Conversion.unsafe + + def can_convert_to(self, typingctx, other): + if isinstance(other, types.Float): + if other.bitwidth >= 32: + return Conversion.safe + else: + return Conversion.unsafe + elif isinstance(other, types.Integer): + return Conversion.unsafe + + def unify(self, typingctx, other): + if isinstance(other, (types.Float, types.Integer)): + return typingctx.unify_pairs(self, other) + + +bfloat16 = Bfloat16() diff --git a/pyproject.toml b/pyproject.toml index bed757c95..6ccc44b30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ test = [ "pytest", "pytest-xdist", "filecheck", + "ml_dtypes", ] test-cu12 = [ "numba-cuda[cu12]",