From 987617db2290093ca2d7a5c28da51783f287e1eb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:25:59 -0700 Subject: [PATCH 01/56] update config file with use separate registries --- configs/cuda_bf16.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 3f315e0f7..29aa1d2dd 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -1,5 +1,7 @@ Name: Numba Bfloat16 -Version: 0.0.1 +Version: 0.0.2 +GPU Arch: + - sm_80 # The first architecture to support bfloat16 Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h File List: - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h @@ -22,3 +24,4 @@ Shim Include Override: "\"cuda_bf16.h\"" Additional Import: - os Require Pynvjitlink: False +Use Separate Registry: true From feb8a09988750f79d8e52116fb4262413ae2f8c6 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:30:49 -0700 Subject: [PATCH 02/56] regenerate bfloat16 bindings with lakshayg/Numbast@6282df4 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 14086 +++++++++++++++-- 1 file changed, 12671 insertions(+), 1415 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index e6220fe67..4fd6c50e4 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,12 +1,12 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: -# Ast_canopy version: 0.3.0 -# Numbast version: 0.3.0 -# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal -# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True} -# Config file path (relative to the 
path of the generated binding): ../../../../configs/cuda_bf16.yml +# Ast_canopy version: 0.4.0 +# Numbast version: 0.4.0 +# Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ +# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} +# Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml # Cudatoolkit version: (12, 8) -# Default CUDA_HOME path: /home/wangm/micromamba/envs/numbast +# Default CUDA_HOME path: /home/wangm/miniforge3/envs/numbast # Imports: @@ -23,11 +23,14 @@ make_attribute_wrapper, register_model, ) +from numba.core.imputils import Registry as TargetRegistry +from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda.cudadecl import register, register_attr, register_global -from numba.cuda.cudaimpl import lower +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 +from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( CPointer, @@ -46,10 +49,23 @@ uint16, uint32, uint64, + void, ) +float32x2 = vector_types["float32x2"] + # Setups: + +typing_registry = TypingRegistry() +register = typing_registry.register +register_attr = typing_registry.register_attr +register_global = typing_registry.register_global +target_registry = TargetRegistry() +lower = target_registry.lower +lower_attr = target_registry.lower_getattr +lower_constant = target_registry.lower_constant + # Shim Stream: @@ -76,83 +92,84 @@ def reset(self): shim_stream.write(shim_prefix) shim_obj = CUSource(shim_stream) + # 
Enums: # Structs: -# Typing for unnamed1401637 -class _type_class_unnamed1401637(Type): +# Typing for unnamed1405307 +class _type_class_unnamed1405307(Type): def __init__(self): - super().__init__(name="unnamed1401637") + super().__init__(name="unnamed1405307") self.alignof_ = 2 self.bitwidth = 2 * 8 -_type_unnamed1401637 = _type_class_unnamed1401637() +_type_unnamed1405307 = _type_class_unnamed1405307() # Make Python API for struct -unnamed1401637 = type("unnamed1401637", (), {"_nbtype": _type_unnamed1401637}) +unnamed1405307 = type("unnamed1405307", (), {"_nbtype": _type_unnamed1405307}) -as_numba_type.register(unnamed1401637, _type_unnamed1401637) +as_numba_type.register(unnamed1405307, _type_unnamed1405307) -@register_model(_type_class_unnamed1401637) -class _model_unnamed1401637(StructModel): +@register_model(_type_class_unnamed1405307) +class _model_unnamed1405307(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401637(AttributeTemplate): - key = globals()["unnamed1401637"] +class _attr_typing_unnamed1405307(AttributeTemplate): + key = globals()["unnamed1405307"] def resolve_x(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401637, "x", "x") +make_attribute_wrapper(_type_class_unnamed1405307, "x", "x") @register -class _ctor_template_unnamed1401637(ConcreteTemplate): - key = globals()["unnamed1401637"] +class _ctor_template_unnamed1405307(ConcreteTemplate): + key = globals()["unnamed1405307"] cases = [] -register_global(unnamed1401637, Function(_ctor_template_unnamed1401637)) +register_global(unnamed1405307, Function(_ctor_template_unnamed1405307)) -# Typing for unnamed1401746 -class _type_class_unnamed1401746(Type): +# Typing for unnamed1405416 +class _type_class_unnamed1405416(Type): def __init__(self): - super().__init__(name="unnamed1401746") + super().__init__(name="unnamed1405416") self.alignof_ = 4 self.bitwidth = 4 * 8 
-_type_unnamed1401746 = _type_class_unnamed1401746() +_type_unnamed1405416 = _type_class_unnamed1405416() # Make Python API for struct -unnamed1401746 = type("unnamed1401746", (), {"_nbtype": _type_unnamed1401746}) +unnamed1405416 = type("unnamed1405416", (), {"_nbtype": _type_unnamed1405416}) -as_numba_type.register(unnamed1401746, _type_unnamed1401746) +as_numba_type.register(unnamed1405416, _type_unnamed1405416) -@register_model(_type_class_unnamed1401746) -class _model_unnamed1401746(StructModel): +@register_model(_type_class_unnamed1405416) +class _model_unnamed1405416(StructModel): def __init__(self, dmm, fe_type): members = [("x", uint16), ("y", uint16)] super().__init__(dmm, fe_type, members) @register_attr -class _attr_typing_unnamed1401746(AttributeTemplate): - key = globals()["unnamed1401746"] +class _attr_typing_unnamed1405416(AttributeTemplate): + key = globals()["unnamed1405416"] def resolve_x(self, obj): return uint16 @@ -161,19 +178,19 @@ def resolve_y(self, obj): return uint16 -make_attribute_wrapper(_type_class_unnamed1401746, "x", "x") +make_attribute_wrapper(_type_class_unnamed1405416, "x", "x") -make_attribute_wrapper(_type_class_unnamed1401746, "y", "y") +make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") @register -class _ctor_template_unnamed1401746(ConcreteTemplate): - key = globals()["unnamed1401746"] +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] cases = [] -register_global(unnamed1401746, Function(_ctor_template_unnamed1401746)) +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) # Typing for __nv_bfloat16 @@ -200,17 +217,17 @@ def __init__(self, dmm, fe_type): super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) -def _lower___nv_bfloat16_void(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_1(int &ignore, __nv_bfloat16 *self ) { + 
_ZN13__nv_bfloat16C1Ev_nbst(int &ignore, __nv_bfloat16 *self ) { new (self) __nv_bfloat16(); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_1", + "_ZN13__nv_bfloat16C1Ev_nbst", int32( CPointer(_type___nv_bfloat16), ), @@ -224,9 +241,7 @@ def __nv_bfloat16_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -250,31 +265,31 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat16_void(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_2(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { + _ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst(int &ignore, __nv_bfloat16 *self , __nv_bfloat16_raw* hr) { new (self) __nv_bfloat16(*hr); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_2", - int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1401637)), + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", + int32(CPointer(_type___nv_bfloat16), CPointer(_type_unnamed1405307)), ) def __nv_bfloat16_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat16(arg_0, arg_1) - @lower(__nv_bfloat16, _type_unnamed1401637) + @lower(__nv_bfloat16, _type_unnamed1405307) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_2", shim_raw_str + "_ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw_nbst", shim_raw_str 
) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -291,7 +306,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat16), - CPointer(_type_unnamed1401637), + CPointer(_type_unnamed1405307), ), (selfptr, *argptrs), ) @@ -299,21 +314,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(_type_unnamed1405307)), + value, + ) + -_lower___nv_bfloat16__type_unnamed1401637(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) -def _lower___nv_bfloat16_float16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_3(int &ignore, __nv_bfloat16 *self , __half* f) { + _ZN13__nv_bfloat16C1E6__half_nbst(int &ignore, __nv_bfloat16 *self , __half* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_3", + "_ZN13__nv_bfloat16C1E6__half_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float16)), ) @@ -324,7 +348,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_3", shim_raw_str + "_ZN13__nv_bfloat16C1E6__half_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -346,20 +370,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat16_float16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) -def _lower___nv_bfloat16_float32(shim_stream, shim_obj): +def 
_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_4(int &ignore, __nv_bfloat16 *self , float* f) { + _ZN13__nv_bfloat16C1Ef_nbst(int &ignore, __nv_bfloat16 *self , float* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_4", + "_ZN13__nv_bfloat16C1Ef_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float32)), ) @@ -369,9 +393,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, float32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_4", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ef_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -391,21 +413,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(float32)), + value, + ) + -_lower___nv_bfloat16_float32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) -def _lower___nv_bfloat16_float64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_5(int &ignore, __nv_bfloat16 *self , double* f) { + _ZN13__nv_bfloat16C1Ed_nbst(int &ignore, __nv_bfloat16 *self , double* f) { new (self) __nv_bfloat16(*f); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_5", + "_ZN13__nv_bfloat16C1Ed_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(float64)), ) @@ -415,9 +446,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): 
@lower(__nv_bfloat16, float64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_5", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ed_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -437,21 +466,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(float64)), + value, + ) -_lower___nv_bfloat16_float64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) -def _lower___nv_bfloat16_int16(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_6(int &ignore, __nv_bfloat16 *self , short* val) { + _ZN13__nv_bfloat16C1Es_nbst(int &ignore, __nv_bfloat16 *self , short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_6", + "_ZN13__nv_bfloat16C1Es_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int16)), ) @@ -461,9 +499,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int16) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_6", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Es_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -483,21 +519,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int16, _type___nv_bfloat16) + def 
conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int16)), + value, + ) + -_lower___nv_bfloat16_int16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint16(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_7(int &ignore, __nv_bfloat16 *self , unsigned short* val) { + _ZN13__nv_bfloat16C1Et_nbst(int &ignore, __nv_bfloat16 *self , unsigned short* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_7", + "_ZN13__nv_bfloat16C1Et_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint16)), ) @@ -507,9 +552,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint16) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_7", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Et_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -529,21 +572,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint16)), + value, + ) + -_lower___nv_bfloat16_uint16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) -def _lower___nv_bfloat16_int32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_8(int &ignore, __nv_bfloat16 *self , int* val) { + _ZN13__nv_bfloat16C1Ei_nbst(int 
&ignore, __nv_bfloat16 *self , int* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_8", + "_ZN13__nv_bfloat16C1Ei_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int32)), ) @@ -553,9 +605,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_8", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ei_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -575,21 +625,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int32)), + value, + ) + -_lower___nv_bfloat16_int32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint32(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_9(int &ignore, __nv_bfloat16 *self , unsigned int* val) { + _ZN13__nv_bfloat16C1Ej_nbst(int &ignore, __nv_bfloat16 *self , unsigned int* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_9", + "_ZN13__nv_bfloat16C1Ej_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint32)), ) @@ -599,9 +658,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint32) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_9", shim_raw_str - ) + 
shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ej_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -621,21 +678,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint32)), + value, + ) -_lower___nv_bfloat16_uint32(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, shim_obj): + +def _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_10(int &ignore, __nv_bfloat16 *self , long* val) { + _ZN13__nv_bfloat16C1El_nbst(int &ignore, __nv_bfloat16 *self , long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_10", + "_ZN13__nv_bfloat16C1El_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -645,9 +711,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_10", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1El_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -667,21 +731,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int64)), + value, + ) + -_lower___nv_bfloat16_int64(shim_stream, shim_obj) 
+_lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_11(int &ignore, __nv_bfloat16 *self , unsigned long* val) { + _ZN13__nv_bfloat16C1Em_nbst(int &ignore, __nv_bfloat16 *self , unsigned long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_11", + "_ZN13__nv_bfloat16C1Em_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -691,9 +764,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_11", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Em_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -713,21 +784,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint64)), + value, + ) + -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) -def _lower___nv_bfloat16_int64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_12(int &ignore, __nv_bfloat16 *self , long long* val) { + _ZN13__nv_bfloat16C1Ex_nbst(int &ignore, __nv_bfloat16 *self , long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_12", + 
"_ZN13__nv_bfloat16C1Ex_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(int64)), ) @@ -737,9 +817,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, int64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_12", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ex_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ -759,21 +837,30 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(int64)), + value, + ) + -_lower___nv_bfloat16_int64(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) -def _lower___nv_bfloat16_uint64(shim_stream, shim_obj): +def _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16____nv_bfloat16_13(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { + _ZN13__nv_bfloat16C1Ey_nbst(int &ignore, __nv_bfloat16 *self , unsigned long long* val) { new (self) __nv_bfloat16(*val); return 0; } """ _ctor_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16____nv_bfloat16_13", + "_ZN13__nv_bfloat16C1Ey_nbst", int32(CPointer(_type___nv_bfloat16), CPointer(uint64)), ) @@ -783,9 +870,7 @@ def __nv_bfloat16_device_caller(arg_0, arg_1): @lower(__nv_bfloat16, uint64) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - "____nv_bfloat16____nv_bfloat16_13", shim_raw_str - ) + shim_stream.write_with_key("_ZN13__nv_bfloat16C1Ey_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" ) @@ 
-805,8 +890,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, CPointer(uint64)), + value, + ) -_lower___nv_bfloat16_uint64(shim_stream, shim_obj) + +_lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @register @@ -816,7 +910,7 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): signature( _type___nv_bfloat16, ), - signature(_type___nv_bfloat16, _type_unnamed1401637), + signature(_type___nv_bfloat16, _type_unnamed1405307), signature(_type___nv_bfloat16, float16), signature(_type___nv_bfloat16, float32), signature(_type___nv_bfloat16, float64), @@ -834,18 +928,18 @@ class _ctor_template___nv_bfloat16(ConcreteTemplate): register_global(__nv_bfloat16, Function(_ctor_template___nv_bfloat16)) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", - _type_unnamed1401637( + "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -853,11 +947,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, 
value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -870,28 +965,28 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) -def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): +def _from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator___nv_bfloat16_raw_2(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1(__nv_bfloat16_raw &retval, __nv_bfloat16 *self) { retval = self->operator __nv_bfloat16_raw(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", - _type_unnamed1401637( + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + _type_unnamed1405307( CPointer(_type___nv_bfloat16), ), ) @@ -899,11 +994,12 @@ def _from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat16(arg): return _op_decl___nv_bfloat16(arg) - @lower_cast(_type___nv_bfloat16, _type_unnamed1401637) + @lower_cast(_type___nv_bfloat16, _type_unnamed1405307) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator___nv_bfloat16_raw_2", shim_raw_str + "____nv_bfloat16__ZNVK13__nv_bfloat16cv17__nv_bfloat16_rawEv_1", + shim_raw_str, ) ptr = 
builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -916,27 +1012,27 @@ def impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat16, signature( - _type_unnamed1401637, + _type_unnamed1405307, CPointer(_type___nv_bfloat16), ), (ptr,), ) -_from___nv_bfloat16_to__type_unnamed1401637_lower(shim_stream, shim_obj) +_from___nv_bfloat16_to__type_unnamed1405307_lower(shim_stream, shim_obj) def _from___nv_bfloat16_to_float32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_float_1(float &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1(float &retval, __nv_bfloat16 *self) { retval = self->operator float(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_float_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", float32( CPointer(_type___nv_bfloat16), ), @@ -949,7 +1045,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_float_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvfEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -975,14 +1071,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_signed_char_1(signed char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1(signed char &retval, __nv_bfloat16 *self) { retval = self->operator signed char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_signed_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -995,7 +1091,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, 
fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_signed_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvaEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1021,14 +1117,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_char_1(unsigned char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1(unsigned char &retval, __nv_bfloat16 *self) { retval = self->operator unsigned char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", uint8( CPointer(_type___nv_bfloat16), ), @@ -1041,7 +1137,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvhEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1067,14 +1163,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int8_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_char_1(char &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1(char &retval, __nv_bfloat16 *self) { retval = self->operator char(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_char_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", int8( CPointer(_type___nv_bfloat16), ), @@ -1087,7 +1183,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_char_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvcEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1113,14 +1209,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_short_1(short &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1(short &retval, __nv_bfloat16 *self) { retval = self->operator short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", int16( CPointer(_type___nv_bfloat16), ), @@ -1133,7 +1229,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvsEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1159,14 +1255,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint16_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_short_1(unsigned short &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1(unsigned short &retval, __nv_bfloat16 *self) { retval = self->operator unsigned short(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_short_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", uint16( CPointer(_type___nv_bfloat16), ), @@ -1179,7 +1275,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_short_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvtEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1205,14 +1301,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_int_1(int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1(int &retval, __nv_bfloat16 *self) { retval = self->operator int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", int32( CPointer(_type___nv_bfloat16), ), @@ -1225,7 +1321,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cviEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1251,14 +1347,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint32_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_int_1(unsigned int &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1(unsigned int &retval, __nv_bfloat16 *self) { retval = self->operator unsigned int(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_int_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", uint32( CPointer(_type___nv_bfloat16), ), @@ -1271,7 +1367,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( - "____nv_bfloat16_operator_unsigned_int_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvjEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1297,14 +1393,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_1(long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1(long &retval, __nv_bfloat16 *self) { retval = self->operator long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1317,7 +1413,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvlEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1343,14 +1439,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_1(unsigned long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1(unsigned long &retval, __nv_bfloat16 *self) { retval = self->operator unsigned long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1363,7 +1459,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - 
"____nv_bfloat16_operator_unsigned_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvmEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1389,14 +1485,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_int64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_long_long_1(long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1(long long &retval, __nv_bfloat16 *self) { retval = self->operator long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", int64( CPointer(_type___nv_bfloat16), ), @@ -1409,7 +1505,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvxEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1435,14 +1531,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_uint64_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_unsigned_long_long_1(unsigned long long &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1(unsigned long long &retval, __nv_bfloat16 *self) { retval = self->operator unsigned long long(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_unsigned_long_long_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", uint64( CPointer(_type___nv_bfloat16), ), @@ -1455,7 +1551,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - 
"____nv_bfloat16_operator_unsigned_long_long_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvyEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1481,14 +1577,14 @@ def impl(context, builder, fromty, toty, value): def _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat16_operator_bool_1(bool &retval, __nv_bfloat16 *self) { + ____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1(bool &retval, __nv_bfloat16 *self) { retval = self->operator bool(); return 0; } """ _op_decl___nv_bfloat16 = declare_device( - "____nv_bfloat16_operator_bool_1", + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", bool_( CPointer(_type___nv_bfloat16), ), @@ -1501,7 +1597,7 @@ def _conversion_op_caller___nv_bfloat16(arg): def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat16_operator_bool_1", shim_raw_str + "____nv_bfloat16__ZNK13__nv_bfloat16cvbEv_1", shim_raw_str ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat16), name="selfptr" @@ -1565,17 +1661,17 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class___nv_bfloat162, "y", "y") -def _lower___nv_bfloat162_void(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_1(int &ignore, __nv_bfloat162 *self ) { + _ZN14__nv_bfloat162C1Ev_nbst(int &ignore, __nv_bfloat162 *self ) { new (self) __nv_bfloat162(); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_1", + "_ZN14__nv_bfloat162C1Ev_nbst", int32( CPointer(_type___nv_bfloat162), ), @@ -1589,9 +1685,7 @@ def __nv_bfloat162_device_caller(arg_0): ) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key( - 
"____nv_bfloat162____nv_bfloat162_1", shim_raw_str - ) + shim_stream.write_with_key("_ZN14__nv_bfloat162C1Ev_nbst", shim_raw_str) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" ) @@ -1615,20 +1709,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162_void(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1Ev(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_2(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1EOS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_2", + "_ZN14__nv_bfloat162C1EOS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1639,7 +1733,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_2", shim_raw_str + "_ZN14__nv_bfloat162C1EOS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1665,22 +1759,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1EOS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -): +def _lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_3(int &ignore, __nv_bfloat162 *self , __nv_bfloat16* a, __nv_bfloat16* b) { + _ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst(int &ignore, __nv_bfloat162 
*self , __nv_bfloat16* a, __nv_bfloat16* b) { new (self) __nv_bfloat162(*a, *b); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_3", + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", int32( CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat16), @@ -1695,7 +1787,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1, arg_2): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_3", shim_raw_str + "_ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1722,22 +1814,20 @@ def ctor_impl(context, builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat16__type___nv_bfloat16( - shim_stream, shim_obj -) +_lower__ZN14__nv_bfloat162C1ERK13__nv_bfloat16S2_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_4(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { + _ZN14__nv_bfloat162C1ERKS__nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162* src) { new (self) __nv_bfloat162(*src); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_4", + "_ZN14__nv_bfloat162C1ERKS__nbst", int32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), ) @@ -1748,7 +1838,7 @@ def __nv_bfloat162_device_caller(arg_0, arg_1): def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_4", shim_raw_str + "_ZN14__nv_bfloat162C1ERKS__nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1774,31 +1864,31 @@ def ctor_impl(context, 
builder, sig, args): ) -_lower___nv_bfloat162__type___nv_bfloat162(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERKS_(shim_stream, shim_obj) -def _lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj): +def _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162____nv_bfloat162_5(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { + _ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst(int &ignore, __nv_bfloat162 *self , __nv_bfloat162_raw* h2r) { new (self) __nv_bfloat162(*h2r); return 0; } """ _ctor_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162____nv_bfloat162_5", - int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1401746)), + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", + int32(CPointer(_type___nv_bfloat162), CPointer(_type_unnamed1405416)), ) def __nv_bfloat162_device_caller(arg_0, arg_1): return _ctor_decl___nv_bfloat162(arg_0, arg_1) - @lower(__nv_bfloat162, _type_unnamed1401746) + @lower(__nv_bfloat162, _type_unnamed1405416) def ctor_impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162____nv_bfloat162_5", shim_raw_str + "_ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw_nbst", shim_raw_str ) selfptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1815,7 +1905,7 @@ def ctor_impl(context, builder, sig, args): signature( int32, CPointer(_type___nv_bfloat162), - CPointer(_type_unnamed1401746), + CPointer(_type_unnamed1405416), ), (selfptr, *argptrs), ) @@ -1823,8 +1913,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) + @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat162, CPointer(_type_unnamed1405416)), + 
value, + ) + -_lower___nv_bfloat162__type_unnamed1401746(shim_stream, shim_obj) +_lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) @register @@ -1839,25 +1938,25 @@ class _ctor_template___nv_bfloat162(ConcreteTemplate): _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 ), signature(_type___nv_bfloat162, _type___nv_bfloat162), - signature(_type___nv_bfloat162, _type_unnamed1401746), + signature(_type___nv_bfloat162, _type_unnamed1405416), ] register_global(__nv_bfloat162, Function(_ctor_template___nv_bfloat162)) -def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): +def _from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - ____nv_bfloat162_operator___nv_bfloat162_raw_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { + ____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1(__nv_bfloat162_raw &retval, __nv_bfloat162 *self) { retval = self->operator __nv_bfloat162_raw(); return 0; } """ _op_decl___nv_bfloat162 = declare_device( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", - _type_unnamed1401746( + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + _type_unnamed1405416( CPointer(_type___nv_bfloat162), ), ) @@ -1865,11 +1964,12 @@ def _from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj): def _conversion_op_caller___nv_bfloat162(arg): return _op_decl___nv_bfloat162(arg) - @lower_cast(_type___nv_bfloat162, _type_unnamed1401746) + @lower_cast(_type___nv_bfloat162, _type_unnamed1405416) def impl(context, builder, fromty, toty, value): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( - "____nv_bfloat162_operator___nv_bfloat162_raw_1", shim_raw_str + "____nv_bfloat162__ZNK14__nv_bfloat162cv18__nv_bfloat162_rawEv_1", + shim_raw_str, ) ptr = builder.alloca( context.get_value_type(_type___nv_bfloat162), name="selfptr" @@ -1882,1997 +1982,2083 @@ def 
impl(context, builder, fromty, toty, value): builder, _conversion_op_caller___nv_bfloat162, signature( - _type_unnamed1401746, + _type_unnamed1405416, CPointer(_type___nv_bfloat162), ), (ptr,), ) -_from___nv_bfloat162_to__type_unnamed1401746_lower(shim_stream, shim_obj) +_from___nv_bfloat162_to__type_unnamed1405416_lower(shim_stream, shim_obj) # Functions: -def make_bfloat162(): +def __double2bfloat16(): pass -def _make_bfloat162_1_lower(shim_stream, shim_obj): +def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - make_bfloat162_1(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { - retval = make_bfloat162(*x, *y); + _ZL17__double2bfloat16d_nbst(__nv_bfloat16 &retval , double* a) { + retval = __double2bfloat16(*a); return 0; } """ - make_bfloat162_1 = declare_device( - "make_bfloat162_1", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL17__double2bfloat16d_nbst = declare_device( + "_ZL17__double2bfloat16d_nbst", _type___nv_bfloat16(CPointer(float64)) ) - def make_bfloat162_1_caller(arg_0, arg_1): - return make_bfloat162_1(arg_0, arg_1) + def _ZL17__double2bfloat16d_nbst_caller(arg_0): + return _ZL17__double2bfloat16d_nbst(arg_0) - @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__double2bfloat16, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("make_bfloat162_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - make_bfloat162_1_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__double2bfloat16d_nbst_caller, 
+ signature(_type___nv_bfloat16, CPointer(float64)), ptrs, ) -_make_bfloat162_1_lower(shim_stream, shim_obj) +_lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj) -def htrunc(): +def __float2bfloat16(): pass -def _htrunc_1_lower(shim_stream, shim_obj): +def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htrunc_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = htrunc(*h); + _ZL16__float2bfloat16f_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16(*a); return 0; } """ - htrunc_1 = declare_device( - "htrunc_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL16__float2bfloat16f_nbst = declare_device( + "_ZL16__float2bfloat16f_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def htrunc_1_caller(arg_0): - return htrunc_1(arg_0) + def _ZL16__float2bfloat16f_nbst_caller(arg_0): + return _ZL16__float2bfloat16f_nbst(arg_0) - @lower(htrunc, _type___nv_bfloat16) + @lower(__float2bfloat16, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htrunc_1", shim_raw_str) + shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htrunc_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL16__float2bfloat16f_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_htrunc_1_lower(shim_stream, shim_obj) +_lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj) -def hceil(): +def __float2bfloat16_rn(): pass -def _hceil_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hceil_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = 
hceil(*h); + _ZL19__float2bfloat16_rnf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rn(*a); return 0; } """ - hceil_1 = declare_device( - "hceil_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rnf_nbst = declare_device( + "_ZL19__float2bfloat16_rnf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hceil_1_caller(arg_0): - return hceil_1(arg_0) + def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rnf_nbst(arg_0) - @lower(hceil, _type___nv_bfloat16) + @lower(__float2bfloat16_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hceil_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rnf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hceil_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj) -def hfloor(): +def __float2bfloat16_rz(): pass -def _hfloor_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hfloor_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hfloor(*h); + _ZL19__float2bfloat16_rzf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rz(*a); return 0; } """ - hfloor_1 = declare_device( - "hfloor_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rzf_nbst = declare_device( + "_ZL19__float2bfloat16_rzf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hfloor_1_caller(arg_0): - return 
hfloor_1(arg_0) + def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rzf_nbst(arg_0) - @lower(hfloor, _type___nv_bfloat16) + @lower(__float2bfloat16_rz, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hfloor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rzf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hfloor_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rzf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hfloor_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj) -def hrint(): +def __float2bfloat16_rd(): pass -def _hrint_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrint_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = hrint(*h); + _ZL19__float2bfloat16_rdf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_rd(*a); return 0; } """ - hrint_1 = declare_device( - "hrint_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__float2bfloat16_rdf_nbst = declare_device( + "_ZL19__float2bfloat16_rdf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def hrint_1_caller(arg_0): - return hrint_1(arg_0) + def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_rdf_nbst(arg_0) - @lower(hrint, _type___nv_bfloat16) + @lower(__float2bfloat16_rd, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_rdf_nbst", 
shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrint_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__float2bfloat16_rdf_nbst_caller, + signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_hrint_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj) -def h2trunc(): +def __float2bfloat16_ru(): pass -def _h2trunc_1_lower(shim_stream, shim_obj): +def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2trunc_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2trunc(*h); + _ZL19__float2bfloat16_ruf_nbst(__nv_bfloat16 &retval , float* a) { + retval = __float2bfloat16_ru(*a); return 0; } """ - h2trunc_1 = declare_device( - "h2trunc_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__float2bfloat16_ruf_nbst = declare_device( + "_ZL19__float2bfloat16_ruf_nbst", _type___nv_bfloat16(CPointer(float32)) ) - def h2trunc_1_caller(arg_0): - return h2trunc_1(arg_0) + def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): + return _ZL19__float2bfloat16_ruf_nbst(arg_0) - @lower(h2trunc, _type___nv_bfloat162) + @lower(__float2bfloat16_ru, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2trunc_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__float2bfloat16_ruf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2trunc_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__float2bfloat16_ruf_nbst_caller, + 
signature(_type___nv_bfloat16, CPointer(float32)), ptrs, ) -_h2trunc_1_lower(shim_stream, shim_obj) +_lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj) -def h2ceil(): +def __bfloat162float(): pass -def _h2ceil_1_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2ceil_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2ceil(*h); + _ZL16__bfloat162float13__nv_bfloat16_nbst(float &retval , __nv_bfloat16* a) { + retval = __bfloat162float(*a); return 0; } """ - h2ceil_1 = declare_device( - "h2ceil_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__bfloat162float13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", + float32(CPointer(_type___nv_bfloat16)), ) - def h2ceil_1_caller(arg_0): - return h2ceil_1(arg_0) + def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - @lower(h2ceil, _type___nv_bfloat162) + @lower(__bfloat162float, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2ceil_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162float13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2ceil_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__bfloat162float13__nv_bfloat16_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2ceil_1_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2floor(): +def __float2bfloat162_rn(): pass -def _h2floor_1_lower(shim_stream, shim_obj): +def 
_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2floor_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2floor(*h); + _ZL20__float2bfloat162_rnf_nbst(__nv_bfloat162 &retval , float* a) { + retval = __float2bfloat162_rn(*a); return 0; } """ - h2floor_1 = declare_device( - "h2floor_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL20__float2bfloat162_rnf_nbst = declare_device( + "_ZL20__float2bfloat162_rnf_nbst", + _type___nv_bfloat162(CPointer(float32)), ) - def h2floor_1_caller(arg_0): - return h2floor_1(arg_0) + def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): + return _ZL20__float2bfloat162_rnf_nbst(arg_0) - @lower(h2floor, _type___nv_bfloat162) + @lower(__float2bfloat162_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2floor_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__float2bfloat162_rnf_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2floor_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL20__float2bfloat162_rnf_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32)), ptrs, ) -_h2floor_1_lower(shim_stream, shim_obj) +_lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj) -def h2rint(): +def __floats2bfloat162_rn(): pass -def _h2rint_1_lower(shim_stream, shim_obj): +def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rint_1(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = h2rint(*h); + _ZL21__floats2bfloat162_rnff_nbst(__nv_bfloat162 &retval , float* a, float* b) { + retval = __floats2bfloat162_rn(*a, *b); return 0; } """ - h2rint_1 = declare_device( - 
"h2rint_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL21__floats2bfloat162_rnff_nbst = declare_device( + "_ZL21__floats2bfloat162_rnff_nbst", + _type___nv_bfloat162(CPointer(float32), CPointer(float32)), ) - def h2rint_1_caller(arg_0): - return h2rint_1(arg_0) + def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): + return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - @lower(h2rint, _type___nv_bfloat162) + @lower(__floats2bfloat162_rn, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rint_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__floats2bfloat162_rnff_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rint_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL21__floats2bfloat162_rnff_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(float32), CPointer(float32) + ), ptrs, ) -_h2rint_1_lower(shim_stream, shim_obj) +_lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj) -def hsqrt(): +def __low2float(): pass -def _hsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsqrt(*a); + _ZL11__low2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __low2float(*a); return 0; } """ - hsqrt_1 = declare_device( - "hsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL11__low2float14__nv_bfloat162_nbst = declare_device( + "_ZL11__low2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hsqrt_1_caller(arg_0): - return hsqrt_1(arg_0) + def 
_ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - @lower(hsqrt, _type___nv_bfloat16) + @lower(__low2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL11__low2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL11__low2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrsqrt(): +def __high2float(): pass -def _hrsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrsqrt_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrsqrt(*a); + _ZL12__high2float14__nv_bfloat162_nbst(float &retval , __nv_bfloat162* a) { + retval = __high2float(*a); return 0; } """ - hrsqrt_1 = declare_device( - "hrsqrt_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL12__high2float14__nv_bfloat162_nbst = declare_device( + "_ZL12__high2float14__nv_bfloat162_nbst", + float32(CPointer(_type___nv_bfloat162)), ) - def hrsqrt_1_caller(arg_0): - return hrsqrt_1(arg_0) + def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): + return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - @lower(hrsqrt, _type___nv_bfloat16) + @lower(__high2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrsqrt_1", 
shim_raw_str) + shim_stream.write_with_key( + "_ZL12__high2float14__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hrsqrt_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL12__high2float14__nv_bfloat162_nbst_caller, + signature(float32, CPointer(_type___nv_bfloat162)), ptrs, ) -_hrsqrt_1_lower(shim_stream, shim_obj) +_lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj) -def hrcp(): +def __float22bfloat162_rn(): pass -def _hrcp_1_lower(shim_stream, shim_obj): +def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hrcp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hrcp(*a); + _ZL21__float22bfloat162_rn6float2_nbst(__nv_bfloat162 &retval , float2* a) { + retval = __float22bfloat162_rn(*a); return 0; } """ - hrcp_1 = declare_device( - "hrcp_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL21__float22bfloat162_rn6float2_nbst = declare_device( + "_ZL21__float22bfloat162_rn6float2_nbst", + _type___nv_bfloat162(CPointer(float32x2)), ) - def hrcp_1_caller(arg_0): - return hrcp_1(arg_0) + def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): + return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - @lower(hrcp, _type___nv_bfloat16) + @lower(__float22bfloat162_rn, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hrcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL21__float22bfloat162_rn6float2_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - 
hrcp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL21__float22bfloat162_rn6float2_nbst_caller, + signature(_type___nv_bfloat162, CPointer(float32x2)), ptrs, ) -_hrcp_1_lower(shim_stream, shim_obj) +_lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj) -def hlog(): +def __bfloat1622float2(): pass -def _hlog_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog(*a); + _ZL18__bfloat1622float214__nv_bfloat162_nbst(float2 &retval , __nv_bfloat162* a) { + retval = __bfloat1622float2(*a); return 0; } """ - hlog_1 = declare_device( - "hlog_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat1622float214__nv_bfloat162_nbst = declare_device( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", + float32x2(CPointer(_type___nv_bfloat162)), ) - def hlog_1_caller(arg_0): - return hlog_1(arg_0) + def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): + return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - @lower(hlog, _type___nv_bfloat16) + @lower(__bfloat1622float2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat1622float214__nv_bfloat162_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller, + signature(float32x2, CPointer(_type___nv_bfloat162)), ptrs, ) -_hlog_1_lower(shim_stream, shim_obj) 
+_lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj) -def hlog2(): +def __bfloat162char_rz(): pass -def _hlog2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hlog2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog2(*a); + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(signed char &retval , __nv_bfloat16* h) { + retval = __bfloat162char_rz(*h); return 0; } """ - hlog2_1 = declare_device( - "hlog2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", + int8(CPointer(_type___nv_bfloat16)), ) - def hlog2_1_caller(arg_0): - return hlog2_1(arg_0) + def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog2, _type___nv_bfloat16) + @lower(__bfloat162char_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162char_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller, + signature(int8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hlog10(): +def __bfloat162uchar_rz(): pass -def _hlog10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int - hlog10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hlog10(*a); + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(unsigned char &retval , __nv_bfloat16* h) { + retval = __bfloat162uchar_rz(*h); return 0; } """ - hlog10_1 = declare_device( - "hlog10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", + uint8(CPointer(_type___nv_bfloat16)), ) - def hlog10_1_caller(arg_0): - return hlog10_1(arg_0) + def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - @lower(hlog10, _type___nv_bfloat16) + @lower(__bfloat162uchar_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hlog10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hlog10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller, + signature(uint8, CPointer(_type___nv_bfloat16)), ptrs, ) -_hlog10_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def hexp(): +def __bfloat162int_rn(): pass -def _hexp_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp(*a); + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rn(*h); return 0; } """ - hexp_1 = declare_device( - "hexp_1", 
_type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def hexp_1_caller(arg_0): - return hexp_1(arg_0) + def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - @lower(hexp, _type___nv_bfloat16) + @lower(__bfloat162int_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_hexp_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh_approx(): +def __bfloat162int_rz(): pass -def _htanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_approx_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh_approx(*a); + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rz(*h); return 0; } """ - htanh_approx_1 = declare_device( - "htanh_approx_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_approx_1_caller(arg_0): - return 
htanh_approx_1(arg_0) + def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - @lower(htanh_approx, _type___nv_bfloat16) + @lower(__bfloat162int_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htanh_approx_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh_approx(): +def __bfloat162int_rd(): pass -def _h2tanh_approx_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_approx_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh_approx(*a); + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_rd(*h); return 0; } """ - h2tanh_approx_1 = declare_device( - "h2tanh_approx_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def h2tanh_approx_1_caller(arg_0): - return h2tanh_approx_1(arg_0) + def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2tanh_approx, _type___nv_bfloat162) + 
@lower(__bfloat162int_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_approx_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_approx_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2tanh_approx_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def htanh(): +def __bfloat162int_ru(): pass -def _htanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - htanh_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = htanh(*a); + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* h) { + retval = __bfloat162int_ru(*h); return 0; } """ - htanh_1 = declare_device( - "htanh_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16)), ) - def htanh_1_caller(arg_0): - return htanh_1(arg_0) + def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - @lower(htanh, _type___nv_bfloat16) + @lower(__bfloat162int_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("htanh_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162int_ru13__nv_bfloat16_nbst", 
shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - htanh_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), ptrs, ) -_htanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2tanh(): +def __int2bfloat16_rn(): pass -def _h2tanh_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2tanh_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2tanh(*a); + _ZL17__int2bfloat16_rni_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rn(*i); return 0; } """ - h2tanh_1 = declare_device( - "h2tanh_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL17__int2bfloat16_rni_nbst = declare_device( + "_ZL17__int2bfloat16_rni_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def h2tanh_1_caller(arg_0): - return h2tanh_1(arg_0) + def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rni_nbst(arg_0) - @lower(h2tanh, _type___nv_bfloat162) + @lower(__int2bfloat16_rn, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2tanh_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2tanh_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL17__int2bfloat16_rni_nbst_caller, + signature(_type___nv_bfloat16, 
CPointer(int32)), ptrs, ) -_h2tanh_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj) -def hexp2(): +def __int2bfloat16_rz(): pass -def _hexp2_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp2_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp2(*a); + _ZL17__int2bfloat16_rzi_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_rz(*i); return 0; } """ - hexp2_1 = declare_device( - "hexp2_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rzi_nbst = declare_device( + "_ZL17__int2bfloat16_rzi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp2_1_caller(arg_0): - return hexp2_1(arg_0) + def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rzi_nbst(arg_0) - @lower(hexp2, _type___nv_bfloat16) + @lower(__int2bfloat16_rz, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp2_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp2_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rzi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp2_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj) -def hexp10(): +def __int2bfloat16_rd(): pass -def _hexp10_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hexp10_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hexp10(*a); + _ZL17__int2bfloat16_rdi_nbst(__nv_bfloat16 
&retval , int* i) { + retval = __int2bfloat16_rd(*i); return 0; } """ - hexp10_1 = declare_device( - "hexp10_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rdi_nbst = declare_device( + "_ZL17__int2bfloat16_rdi_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hexp10_1_caller(arg_0): - return hexp10_1(arg_0) + def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rdi_nbst(arg_0) - @lower(hexp10, _type___nv_bfloat16) + @lower(__int2bfloat16_rd, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hexp10_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hexp10_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rdi_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hexp10_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj) -def hcos(): +def __int2bfloat16_ru(): pass -def _hcos_1_lower(shim_stream, shim_obj): +def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hcos_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hcos(*a); + _ZL17__int2bfloat16_rui_nbst(__nv_bfloat16 &retval , int* i) { + retval = __int2bfloat16_ru(*i); return 0; } """ - hcos_1 = declare_device( - "hcos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL17__int2bfloat16_rui_nbst = declare_device( + "_ZL17__int2bfloat16_rui_nbst", _type___nv_bfloat16(CPointer(int32)) ) - def hcos_1_caller(arg_0): - return hcos_1(arg_0) + def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): + return _ZL17__int2bfloat16_rui_nbst(arg_0) - @lower(hcos, 
_type___nv_bfloat16) + @lower(__int2bfloat16_ru, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hcos_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hcos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL17__int2bfloat16_rui_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int32)), ptrs, ) -_hcos_1_lower(shim_stream, shim_obj) +_lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj) -def hsin(): +def __bfloat162short_rn(): pass -def _hsin_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - hsin_1(__nv_bfloat16 &retval , __nv_bfloat16* a) { - retval = hsin(*a); + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rn(*h); return 0; } """ - hsin_1 = declare_device( - "hsin_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def hsin_1_caller(arg_0): - return hsin_1(arg_0) + def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - @lower(hsin, _type___nv_bfloat16) + @lower(__bfloat162short_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("hsin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - hsin_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_hsin_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sqrt(): +def __bfloat162short_rz(): pass -def _h2sqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sqrt(*a); + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rz(*h); return 0; } """ - h2sqrt_1 = declare_device( - "h2sqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2sqrt_1_caller(arg_0): - return h2sqrt_1(arg_0) + def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2sqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sqrt_1_caller, - 
signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sqrt_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rsqrt(): +def __bfloat162short_rd(): pass -def _h2rsqrt_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rsqrt_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rsqrt(*a); + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_rd(*h); return 0; } """ - h2rsqrt_1 = declare_device( - "h2rsqrt_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rsqrt_1_caller(arg_0): - return h2rsqrt_1(arg_0) + def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2rsqrt, _type___nv_bfloat162) + @lower(__bfloat162short_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rsqrt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rsqrt_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rsqrt_1_lower(shim_stream, shim_obj) 
+_lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2rcp(): +def __bfloat162short_ru(): pass -def _h2rcp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2rcp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2rcp(*a); + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat162short_ru(*h); return 0; } """ - h2rcp_1 = declare_device( - "h2rcp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), ) - def h2rcp_1_caller(arg_0): - return h2rcp_1(arg_0) + def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2rcp, _type___nv_bfloat162) + @lower(__bfloat162short_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2rcp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__bfloat162short_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2rcp_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2rcp_1_lower(shim_stream, shim_obj) +_lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2log(): +def __short2bfloat16_rn(): pass -def _h2log_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int - h2log_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log(*a); + _ZL19__short2bfloat16_rns_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rn(*i); return 0; } """ - h2log_1 = declare_device( - "h2log_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rns_nbst = declare_device( + "_ZL19__short2bfloat16_rns_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log_1_caller(arg_0): - return h2log_1(arg_0) + def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rns_nbst(arg_0) - @lower(h2log, _type___nv_bfloat162) + @lower(__short2bfloat16_rn, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rns_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rns_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj) -def h2log2(): +def __short2bfloat16_rz(): pass -def _h2log2_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log2(*a); + _ZL19__short2bfloat16_rzs_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rz(*i); return 0; } """ - h2log2_1 = declare_device( - "h2log2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rzs_nbst = declare_device( + "_ZL19__short2bfloat16_rzs_nbst", 
_type___nv_bfloat16(CPointer(int16)) ) - def h2log2_1_caller(arg_0): - return h2log2_1(arg_0) + def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rzs_nbst(arg_0) - @lower(h2log2, _type___nv_bfloat162) + @lower(__short2bfloat16_rz, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2log2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rzs_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rzs_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log2_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj) -def h2log10(): +def __short2bfloat16_rd(): pass -def _h2log10_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2log10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2log10(*a); + _ZL19__short2bfloat16_rds_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_rd(*i); return 0; } """ - h2log10_1 = declare_device( - "h2log10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rds_nbst = declare_device( + "_ZL19__short2bfloat16_rds_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2log10_1_caller(arg_0): - return h2log10_1(arg_0) + def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rds_nbst(arg_0) - @lower(h2log10, _type___nv_bfloat162) + @lower(__short2bfloat16_rd, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - 
shim_stream.write_with_key("h2log10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rds_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2log10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rds_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2log10_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj) -def h2exp(): +def __short2bfloat16_ru(): pass -def _h2exp_1_lower(shim_stream, shim_obj): +def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp(*a); + _ZL19__short2bfloat16_rus_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short2bfloat16_ru(*i); return 0; } """ - h2exp_1 = declare_device( - "h2exp_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL19__short2bfloat16_rus_nbst = declare_device( + "_ZL19__short2bfloat16_rus_nbst", _type___nv_bfloat16(CPointer(int16)) ) - def h2exp_1_caller(arg_0): - return h2exp_1(arg_0) + def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): + return _ZL19__short2bfloat16_rus_nbst(arg_0) - @lower(h2exp, _type___nv_bfloat162) + @lower(__short2bfloat16_ru, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL19__short2bfloat16_rus_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp_1_caller, - 
signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL19__short2bfloat16_rus_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), ptrs, ) -_h2exp_1_lower(shim_stream, shim_obj) +_lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj) -def h2exp2(): +def __bfloat162uint_rn(): pass -def _h2exp2_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp2_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp2(*a); + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rn(*h); return 0; } """ - h2exp2_1 = declare_device( - "h2exp2_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp2_1_caller(arg_0): - return h2exp2_1(arg_0) + def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - @lower(h2exp2, _type___nv_bfloat162) + @lower(__bfloat162uint_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp2_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp2_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp2_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, 
shim_obj) -def h2exp10(): +def __bfloat162uint_rz(): pass -def _h2exp10_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2exp10_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2exp10(*a); + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rz(*h); return 0; } """ - h2exp10_1 = declare_device( - "h2exp10_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2exp10_1_caller(arg_0): - return h2exp10_1(arg_0) + def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - @lower(h2exp10, _type___nv_bfloat162) + @lower(__bfloat162uint_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2exp10_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2exp10_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2exp10_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2cos(): +def __bfloat162uint_rd(): pass -def _h2cos_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
h2cos_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2cos(*a); + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_rd(*h); return 0; } """ - h2cos_1 = declare_device( - "h2cos_1", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2cos_1_caller(arg_0): - return h2cos_1(arg_0) + def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - @lower(h2cos, _type___nv_bfloat162) + @lower(__bfloat162uint_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2cos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2cos_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2cos_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) -def h2sin(): +def __bfloat162uint_ru(): pass -def _h2sin_1_lower(shim_stream, shim_obj): +def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - h2sin_1(__nv_bfloat162 &retval , __nv_bfloat162* a) { - retval = h2sin(*a); + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(unsigned int &retval , __nv_bfloat16* h) { + retval = __bfloat162uint_ru(*h); return 0; } """ - h2sin_1 = declare_device( - "h2sin_1", 
_type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst = declare_device( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", + uint32(CPointer(_type___nv_bfloat16)), ) - def h2sin_1_caller(arg_0): - return h2sin_1(arg_0) + def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - @lower(h2sin, _type___nv_bfloat162) + @lower(__bfloat162uint_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("h2sin_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__bfloat162uint_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - h2sin_1_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller, + signature(uint32, CPointer(_type___nv_bfloat16)), ptrs, ) -_h2sin_1_lower(shim_stream, shim_obj) +_lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def atomicAdd(): +def __uint2bfloat16_rn(): pass -def _atomicAdd_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_1(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rnj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rn(*i); return 0; } """ - atomicAdd_1 = declare_device( - "atomicAdd_1", - _type___nv_bfloat162( - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst = declare_device( + "_ZL18__uint2bfloat16_rnj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - 
def atomicAdd_1_caller(arg_0, arg_1): - return atomicAdd_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__uint2bfloat16_rn, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rnj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_1_caller, - signature( - _type___nv_bfloat162, - CPointer(CPointer(_type___nv_bfloat162)), - CPointer(_type___nv_bfloat162), - ), + _ZL18__uint2bfloat16_rnj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rz(): + pass -def _atomicAdd_2_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - atomicAdd_2(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { - retval = atomicAdd(*address, *val); + _ZL18__uint2bfloat16_rzj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rz(*i); return 0; } """ - atomicAdd_2 = declare_device( - "atomicAdd_2", - _type___nv_bfloat16( - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst = declare_device( + "_ZL18__uint2bfloat16_rzj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def atomicAdd_2_caller(arg_0, arg_1): - return atomicAdd_2(arg_0, arg_1) + def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - 
@lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__uint2bfloat16_rz, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("atomicAdd_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rzj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - atomicAdd_2_caller, - signature( - _type___nv_bfloat16, - CPointer(CPointer(_type___nv_bfloat16)), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rzj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_atomicAdd_2_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_rd(): + pass -def _operator_add_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+(*lh, *rh); + _ZL18__uint2bfloat16_rdj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_rd(*i); return 0; } """ - operator_add_1 = declare_device( - "operator_add_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_rdj_nbst = declare_device( + "_ZL18__uint2bfloat16_rdj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_add_1_caller(arg_0, arg_1): - return operator_add_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_rd, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - 
shim_stream.write_with_key("operator_add_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_rdj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_add_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_rdj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_add_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj) + + +def __uint2bfloat16_ru(): + pass -def _operator_sub_1_lower(shim_stream, shim_obj): +def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-(*lh, *rh); + _ZL18__uint2bfloat16_ruj_nbst(__nv_bfloat16 &retval , unsigned int* i) { + retval = __uint2bfloat16_ru(*i); return 0; } """ - operator_sub_1 = declare_device( - "operator_sub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL18__uint2bfloat16_ruj_nbst = declare_device( + "_ZL18__uint2bfloat16_ruj_nbst", _type___nv_bfloat16(CPointer(uint32)) ) - def operator_sub_1_caller(arg_0, arg_1): - return operator_sub_1(arg_0, arg_1) + def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): + return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__uint2bfloat16_ru, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL18__uint2bfloat16_ruj_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_sub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL18__uint2bfloat16_ruj_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint32)), ptrs, ) -_operator_sub_1_lower(shim_stream, shim_obj) +_lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj) -def _operator_mul_1_lower(shim_stream, shim_obj): +def __bfloat162ushort_rn(): + pass + + +def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*(*lh, *rh); + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rn(*h); return 0; } """ - operator_mul_1 = declare_device( - "operator_mul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_mul_1_caller(arg_0, arg_1): - return operator_mul_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_mul_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rz(): + pass -def _operator_truediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/(*lh, *rh); + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rz(*h); return 0; } """ - operator_truediv_1 = declare_device( - "operator_truediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_1_caller(arg_0, arg_1): - return operator_truediv_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_truediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_rd(): + pass -def _operator_iadd_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator+=(*lh, *rh); + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_rd(*h); return 0; } """ - operator_iadd_1 = declare_device( - "operator_iadd_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_1_caller(arg_0, arg_1): - return operator_iadd_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ushort_ru(): + pass -def _operator_isub_1_lower(shim_stream, shim_obj): +def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator-=(*lh, *rh); + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat162ushort_ru(*h); return 0; } """ - operator_isub_1 = declare_device( - "operator_isub_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), ) - def operator_isub_1_caller(arg_0, arg_1): - return operator_isub_1(arg_0, arg_1) + def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ushort_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", 
None)) return context.compile_internal( builder, - operator_isub_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_1_lower(shim_stream, shim_obj) +_lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_imul_1_lower(shim_stream, shim_obj): +def __ushort2bfloat16_rn(): + pass + + +def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_imul_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator*=(*lh, *rh); + _ZL20__ushort2bfloat16_rnt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rn(*i); return 0; } """ - operator_imul_1 = declare_device( - "operator_imul_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rnt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rnt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_imul_1_caller(arg_0, arg_1): - return operator_imul_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rn, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rnt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - 
CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rnt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_imul_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rz(): + pass -def _operator_itruediv_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_1(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator/=(*lh, *rh); + _ZL20__ushort2bfloat16_rzt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rz(*i); return 0; } """ - operator_itruediv_1 = declare_device( - "operator_itruediv_1", - _type___nv_bfloat16( - CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) - ), + _ZL20__ushort2bfloat16_rzt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rzt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_itruediv_1_caller(arg_0, arg_1): - return operator_itruediv_1(arg_0, arg_1) + def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rz, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rzt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_1_caller, - signature( - _type___nv_bfloat16, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL20__ushort2bfloat16_rzt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) 
-_operator_itruediv_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_rd(): + pass -def _operator_pos_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator+(*h); + _ZL20__ushort2bfloat16_rdt_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_rd(*i); return 0; } """ - operator_pos_1 = declare_device( - "operator_pos_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rdt_nbst = declare_device( + "_ZL20__ushort2bfloat16_rdt_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_pos_1_caller(arg_0): - return operator_pos_1(arg_0) + def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat16) + @lower(__ushort2bfloat16_rd, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rdt_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rdt_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_pos_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj) + + +def __ushort2bfloat16_ru(): + pass -def _operator_neg_1_lower(shim_stream, shim_obj): +def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - 
operator_neg_1(__nv_bfloat16 &retval , __nv_bfloat16* h) { - retval = operator-(*h); + _ZL20__ushort2bfloat16_rut_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort2bfloat16_ru(*i); return 0; } """ - operator_neg_1 = declare_device( - "operator_neg_1", _type___nv_bfloat16(CPointer(_type___nv_bfloat16)) + _ZL20__ushort2bfloat16_rut_nbst = declare_device( + "_ZL20__ushort2bfloat16_rut_nbst", _type___nv_bfloat16(CPointer(uint16)) ) - def operator_neg_1_caller(arg_0): - return operator_neg_1(arg_0) + def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): + return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat16) + @lower(__ushort2bfloat16_ru, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL20__ushort2bfloat16_rut_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_neg_1_caller, - signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + _ZL20__ushort2bfloat16_rut_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), ptrs, ) -_operator_neg_1_lower(shim_stream, shim_obj) +_lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rn(): + pass -def _operator_eq_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_eq_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator==(*lh, *rh); + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rn(*h); return 0; } """ - operator_eq_1 = declare_device( - "operator_eq_1", - 
bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_eq_1_caller(arg_0, arg_1): - return operator_eq_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_eq_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_eq_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_rz(): + pass -def _operator_ne_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ne_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator!=(*lh, *rh); + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rz(*h); return 0; } """ - operator_ne_1 = declare_device( - "operator_ne_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst = 
declare_device( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ne_1_caller(arg_0, arg_1): - return operator_ne_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ne_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ne_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_gt_1_lower(shim_stream, shim_obj): +def make_bfloat162(): + pass + + +def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_gt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>(*lh, *rh); + _ZL14make_bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* x, __nv_bfloat16* y) { + retval = make_bfloat162(*x, *y); return 0; } """ - operator_gt_1 = declare_device( - "operator_gt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL14make_bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + 
CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), ) - def operator_gt_1_caller(arg_0, arg_1): - return operator_gt_1(arg_0, arg_1) + def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL14make_bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_gt_1_caller, + _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller, signature( - bool_, + _type___nv_bfloat162, CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16), ), @@ -3880,858 +4066,11629 @@ def impl(context, builder, sig, args): ) -_operator_gt_1_lower(shim_stream, shim_obj) +_lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) -def _operator_lt_1_lower(shim_stream, shim_obj): +def __bfloat162ull_rd(): + pass + + +def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_lt_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<(*lh, *rh); + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_rd(*h); return 0; } """ - operator_lt_1 = declare_device( - "operator_lt_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def 
operator_lt_1_caller(arg_0, arg_1): - return operator_lt_1(arg_0, arg_1) + def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_lt_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_lt_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ull_ru(): + pass -def _operator_ge_1_lower(shim_stream, shim_obj): +def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_ge_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator>=(*lh, *rh); + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(unsigned long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ull_ru(*h); return 0; } """ - operator_ge_1 = declare_device( - "operator_ge_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst = declare_device( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", + uint64(CPointer(_type___nv_bfloat16)), ) - def operator_ge_1_caller(arg_0, arg_1): - return operator_ge_1(arg_0, arg_1) + def 
_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__bfloat162ull_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_1", shim_raw_str) + shim_stream.write_with_key( + "_ZL17__bfloat162ull_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_ge_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller, + signature(uint64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_ge_1_lower(shim_stream, shim_obj) +_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rn(): + pass -def _operator_le_1_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_le_1(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { - retval = operator<=(*lh, *rh); + _ZL17__ull2bfloat16_rny_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rn(*i); return 0; } """ - operator_le_1 = declare_device( - "operator_le_1", - bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + _ZL17__ull2bfloat16_rny_nbst = declare_device( + "_ZL17__ull2bfloat16_rny_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_le_1_caller(arg_0, arg_1): - return operator_le_1(arg_0, arg_1) + def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rny_nbst(arg_0) - @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__ull2bfloat16_rn, 
uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_1", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_le_1_caller, - signature( - bool_, - CPointer(_type___nv_bfloat16), - CPointer(_type___nv_bfloat16), - ), + _ZL17__ull2bfloat16_rny_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_le_1_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rz(): + pass -def _operator_add_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_add_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+(*lh, *rh); + _ZL17__ull2bfloat16_rzy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rz(*i); return 0; } """ - operator_add_2 = declare_device( - "operator_add_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rzy_nbst = declare_device( + "_ZL17__ull2bfloat16_rzy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_add_2_caller(arg_0, arg_1): - return operator_add_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rz, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_add_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", 
shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_add_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rzy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_add_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_rd(): + pass -def _operator_sub_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_sub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-(*lh, *rh); + _ZL17__ull2bfloat16_rdy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_rd(*i); return 0; } """ - operator_sub_2 = declare_device( - "operator_sub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_rdy_nbst = declare_device( + "_ZL17__ull2bfloat16_rdy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_sub_2_caller(arg_0, arg_1): - return operator_sub_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_rd, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_sub_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return 
context.compile_internal( builder, - operator_sub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_rdy_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_sub_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj) + + +def __ull2bfloat16_ru(): + pass -def _operator_mul_2_lower(shim_stream, shim_obj): +def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_mul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*(*lh, *rh); + _ZL17__ull2bfloat16_ruy_nbst(__nv_bfloat16 &retval , unsigned long long* i) { + retval = __ull2bfloat16_ru(*i); return 0; } """ - operator_mul_2 = declare_device( - "operator_mul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL17__ull2bfloat16_ruy_nbst = declare_device( + "_ZL17__ull2bfloat16_ruy_nbst", _type___nv_bfloat16(CPointer(uint64)) ) - def operator_mul_2_caller(arg_0, arg_1): - return operator_mul_2(arg_0, arg_1) + def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): + return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ull2bfloat16_ru, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_mul_2", shim_raw_str) + shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_mul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL17__ull2bfloat16_ruy_nbst_caller, + 
signature(_type___nv_bfloat16, CPointer(uint64)), ptrs, ) -_operator_mul_2_lower(shim_stream, shim_obj) +_lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj) -def _operator_truediv_2_lower(shim_stream, shim_obj): +def __bfloat162ll_rn(): + pass + + +def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_truediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator/(*lh, *rh); + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rn(*h); return 0; } """ - operator_truediv_2 = declare_device( - "operator_truediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_truediv_2_caller(arg_0, arg_1): - return operator_truediv_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_truediv_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rn13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_truediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) 
-_operator_truediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rz(): + pass -def _operator_iadd_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_iadd_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator+=(*lh, *rh); + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rz(*h); return 0; } """ - operator_iadd_2 = declare_device( - "operator_iadd_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_iadd_2_caller(arg_0, arg_1): - return operator_iadd_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_iadd_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_iadd_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_iadd_2_lower(shim_stream, shim_obj) 
+_lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat162ll_rd(): + pass -def _operator_isub_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_isub_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator-=(*lh, *rh); + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_rd(*h); return 0; } """ - operator_isub_2 = declare_device( - "operator_isub_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_isub_2_caller(arg_0, arg_1): - return operator_isub_2(arg_0, arg_1) + def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_isub_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_rd13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_isub_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_isub_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj) + 
+ +def __bfloat162ll_ru(): + pass -def _operator_imul_2_lower(shim_stream, shim_obj): +def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_imul_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator*=(*lh, *rh); + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(long long &retval , __nv_bfloat16* h) { + retval = __bfloat162ll_ru(*h); return 0; } """ - operator_imul_2 = declare_device( - "operator_imul_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst = declare_device( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", + int64(CPointer(_type___nv_bfloat16)), ) - def operator_imul_2_caller(arg_0, arg_1): - return operator_imul_2(arg_0, arg_1) + def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): + return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__bfloat162ll_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_imul_2", shim_raw_str) + shim_stream.write_with_key( + "_ZL16__bfloat162ll_ru13__nv_bfloat16_nbst", shim_raw_str + ) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_imul_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller, + signature(int64, CPointer(_type___nv_bfloat16)), ptrs, ) -_operator_imul_2_lower(shim_stream, shim_obj) +_lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj) -def _operator_itruediv_2_lower(shim_stream, shim_obj): +def 
__ll2bfloat16_rn(): + pass + + +def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_itruediv_2(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator/=(*lh, *rh); + _ZL16__ll2bfloat16_rnx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rn(*i); return 0; } """ - operator_itruediv_2 = declare_device( - "operator_itruediv_2", - _type___nv_bfloat162( - CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) - ), + _ZL16__ll2bfloat16_rnx_nbst = declare_device( + "_ZL16__ll2bfloat16_rnx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_itruediv_2_caller(arg_0, arg_1): - return operator_itruediv_2(arg_0, arg_1) + def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__ll2bfloat16_rn, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_itruediv_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_itruediv_2_caller, - signature( - _type___nv_bfloat162, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), + _ZL16__ll2bfloat16_rnx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_itruediv_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rz(): + pass -def _operator_pos_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_pos_2(__nv_bfloat162 &retval , 
__nv_bfloat162* h) { - retval = operator+(*h); + _ZL16__ll2bfloat16_rzx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rz(*i); return 0; } """ - operator_pos_2 = declare_device( - "operator_pos_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rzx_nbst = declare_device( + "_ZL16__ll2bfloat16_rzx_nbst", _type___nv_bfloat16(CPointer(int64)) ) - def operator_pos_2_caller(arg_0): - return operator_pos_2(arg_0) + def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat162) + @lower(__ll2bfloat16_rz, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_pos_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) return context.compile_internal( builder, - operator_pos_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + _ZL16__ll2bfloat16_rzx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), ptrs, ) -_operator_pos_2_lower(shim_stream, shim_obj) +_lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_rd(): + pass -def _operator_neg_2_lower(shim_stream, shim_obj): +def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int - operator_neg_2(__nv_bfloat162 &retval , __nv_bfloat162* h) { - retval = operator-(*h); + _ZL16__ll2bfloat16_rdx_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_rd(*i); return 0; } """ - operator_neg_2 = declare_device( - "operator_neg_2", _type___nv_bfloat162(CPointer(_type___nv_bfloat162)) + _ZL16__ll2bfloat16_rdx_nbst = declare_device( + "_ZL16__ll2bfloat16_rdx_nbst", 
_type___nv_bfloat16(CPointer(int64)) ) - def operator_neg_2_caller(arg_0): - return operator_neg_2(arg_0) + def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat162) + @lower(__ll2bfloat16_rd, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_neg_2", shim_raw_str) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] for ptr, ty, arg in zip(ptrs, sig.args, args): builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_neg_2_caller, - signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), - ptrs, - ) + return context.compile_internal( + builder, + _ZL16__ll2bfloat16_rdx_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), + ptrs, + ) + + +_lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj) + + +def __ll2bfloat16_ru(): + pass + + +def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__ll2bfloat16_rux_nbst(__nv_bfloat16 &retval , long long* i) { + retval = __ll2bfloat16_ru(*i); + return 0; + } + """ + + _ZL16__ll2bfloat16_rux_nbst = declare_device( + "_ZL16__ll2bfloat16_rux_nbst", _type___nv_bfloat16(CPointer(int64)) + ) + + def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): + return _ZL16__ll2bfloat16_rux_nbst(arg_0) + + @lower(__ll2bfloat16_ru, int64) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL16__ll2bfloat16_rux_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int64)), + ptrs, + ) + + +_lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj) + + +def htrunc(): + pass + + +def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6htrunc13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = htrunc(*h); + return 0; + } + """ + + _ZL6htrunc13__nv_bfloat16_nbst = declare_device( + "_ZL6htrunc13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) + + @lower(htrunc, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6htrunc13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6htrunc13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hceil(): + pass + + +def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hceil13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hceil(*h); + return 0; + } + """ + + _ZL5hceil13__nv_bfloat16_nbst = declare_device( + "_ZL5hceil13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hceil13__nv_bfloat16_nbst(arg_0) + + @lower(hceil, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + 
"_ZL5hceil13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hceil13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hfloor(): + pass + + +def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hfloor13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hfloor(*h); + return 0; + } + """ + + _ZL6hfloor13__nv_bfloat16_nbst = declare_device( + "_ZL6hfloor13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) + + @lower(hfloor, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hfloor13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hfloor13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrint(): + pass + + +def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hrint13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = hrint(*h); + return 0; + } + """ + + _ZL5hrint13__nv_bfloat16_nbst = declare_device( + "_ZL5hrint13__nv_bfloat16_nbst", + 
_type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hrint13__nv_bfloat16_nbst(arg_0) + + @lower(hrint, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hrint13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hrint13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2trunc(): + pass + + +def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2trunc14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2trunc(*h); + return 0; + } + """ + + _ZL7h2trunc14__nv_bfloat162_nbst = declare_device( + "_ZL7h2trunc14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) + + @lower(h2trunc, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2trunc14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2trunc14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2ceil(): + 
pass + + +def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2ceil14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2ceil(*h); + return 0; + } + """ + + _ZL6h2ceil14__nv_bfloat162_nbst = declare_device( + "_ZL6h2ceil14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) + + @lower(h2ceil, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2ceil14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2ceil14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2floor(): + pass + + +def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2floor14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2floor(*h); + return 0; + } + """ + + _ZL7h2floor14__nv_bfloat162_nbst = declare_device( + "_ZL7h2floor14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) + + @lower(h2floor, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2floor14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, 
arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2floor14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rint(): + pass + + +def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2rint14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = h2rint(*h); + return 0; + } + """ + + _ZL6h2rint14__nv_bfloat162_nbst = declare_device( + "_ZL6h2rint14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) + + @lower(h2rint, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2rint14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2rint14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat162bfloat162(): + pass + + +def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(__nv_bfloat162 &retval , __nv_bfloat16* a) { + retval = __bfloat162bfloat162(*a); + return 0; + } + """ + + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", + 
_type___nv_bfloat162(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat162bfloat162, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat162bfloat16213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __lowhigh2highlow(): + pass + + +def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __lowhigh2highlow(*a); + return 0; + } + """ + + _ZL17__lowhigh2highlow14__nv_bfloat162_nbst = declare_device( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): + return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) + + @lower(__lowhigh2highlow, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__lowhigh2highlow14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __lows2bfloat162(): + pass + + +def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __lows2bfloat162(*a, *b); + return 0; + } + """ + + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__lows2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __highs2bfloat162(): + pass + + +def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __highs2bfloat162(*a, *b); + 
return 0; + } + """ + + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst = declare_device( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL17__highs2bfloat16214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __high2bfloat16(): + pass + + +def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__high2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __high2bfloat16(*a); + return 0; + } + """ + + _ZL15__high2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__high2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __low2bfloat16(): + pass + + +def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__low2bfloat1614__nv_bfloat162_nbst(__nv_bfloat16 &retval , __nv_bfloat162* a) { + retval = __low2bfloat16(*a); + return 0; + } + """ + + _ZL14__low2bfloat1614__nv_bfloat162_nbst = declare_device( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat162)), + ) + + def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): + return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat16, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__low2bfloat1614__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hisinf(): + pass + + +def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisinf13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* a) { + retval = __hisinf(*a); + return 0; + } + """ + + 
_ZL8__hisinf13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisinf13__nv_bfloat16_nbst", int32(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) + + @lower(__hisinf, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisinf13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisinf13__nv_bfloat16_nbst_caller, + signature(int32, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __halves2bfloat162(): + pass + + +def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(__nv_bfloat162 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __halves2bfloat162(*a, *b); + return 0; + } + """ + + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst = declare_device( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL18__halves2bfloat16213__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __low2bfloat162(): + pass + + +def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__low2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __low2bfloat162(*a); + return 0; + } + """ + + _ZL15__low2bfloat16214__nv_bfloat162_nbst = declare_device( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__low2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__low2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __high2bfloat162(): + pass + + +def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__high2bfloat16214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __high2bfloat162(*a); + return 0; + } + """ + + _ZL16__high2bfloat16214__nv_bfloat162_nbst = 
declare_device( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): + return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) + + @lower(__high2bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__high2bfloat16214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_short(): + pass + + +def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_short(*h); + return 0; + } + """ + + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst = declare_device( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", + int16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): + return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_short, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__bfloat16_as_short13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return 
context.compile_internal( + builder, + _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller, + signature(int16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __bfloat16_as_ushort(): + pass + + +def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(unsigned short &retval , __nv_bfloat16* h) { + retval = __bfloat16_as_ushort(*h); + return 0; + } + """ + + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst = declare_device( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", + uint16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): + return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) + + @lower(__bfloat16_as_ushort, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller, + signature(uint16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __short_as_bfloat16(): + pass + + +def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL19__short_as_bfloat16s_nbst(__nv_bfloat16 &retval , short* i) { + retval = __short_as_bfloat16(*i); + return 0; + } + """ + + _ZL19__short_as_bfloat16s_nbst = declare_device( + "_ZL19__short_as_bfloat16s_nbst", _type___nv_bfloat16(CPointer(int16)) + ) + + def 
_ZL19__short_as_bfloat16s_nbst_caller(arg_0): + return _ZL19__short_as_bfloat16s_nbst(arg_0) + + @lower(__short_as_bfloat16, int16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL19__short_as_bfloat16s_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL19__short_as_bfloat16s_nbst_caller, + signature(_type___nv_bfloat16, CPointer(int16)), + ptrs, + ) + + +_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj) + + +def __ushort_as_bfloat16(): + pass + + +def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL20__ushort_as_bfloat16t_nbst(__nv_bfloat16 &retval , unsigned short* i) { + retval = __ushort_as_bfloat16(*i); + return 0; + } + """ + + _ZL20__ushort_as_bfloat16t_nbst = declare_device( + "_ZL20__ushort_as_bfloat16t_nbst", _type___nv_bfloat16(CPointer(uint16)) + ) + + def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): + return _ZL20__ushort_as_bfloat16t_nbst(arg_0) + + @lower(__ushort_as_bfloat16, uint16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL20__ushort_as_bfloat16t_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL20__ushort_as_bfloat16t_nbst_caller, + signature(_type___nv_bfloat16, CPointer(uint16)), + ptrs, + ) + + +_lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj) + + +def __shfl_sync(): + pass + + +def _lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + 
shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def __shfl_up_sync(): + pass + + +def _lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + 
CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_down_sync(): + pass + + +def _lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst = declare_device( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst(shim_stream, shim_obj) + + +def __shfl_xor_sync(): + pass + + +def _lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(__nv_bfloat162 &retval , unsigned int* mask, __nv_bfloat162* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", + _type___nv_bfloat162( + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(uint32), + CPointer(_type___nv_bfloat162), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* srcLane, int* width) { + retval = __shfl_sync(*mask, *var, *srcLane, *width); + return 0; + } + """ + + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL11__shfl_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__shfl_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL11__shfl_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def _lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* 
mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_up_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, unsigned int* delta, int* width) { + retval = __shfl_down_sync(*mask, *var, *delta, *width); + return 0; + } + """ + + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst = declare_device( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ) + + def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( + 
arg_0, arg_1, arg_2, arg_3 + ): + return _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(uint32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst(shim_stream, shim_obj) + + +def _lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(__nv_bfloat16 &retval , unsigned int* mask, __nv_bfloat16* var, int* laneMask, int* width) { + retval = __shfl_xor_sync(*mask, *var, *laneMask, *width); + return 0; + } + """ + + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst = declare_device( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", + _type___nv_bfloat16( + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ) + + def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( + arg_0, arg_1, arg_2, arg_3 + ): + return _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst( + arg_0, arg_1, arg_2, arg_3 + ) + + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for 
arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(uint32), + CPointer(_type___nv_bfloat16), + CPointer(int32), + CPointer(int32), + ), + ptrs, + ) + + +_lower__ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst(shim_stream, shim_obj) + + +def __ldg(): + pass + + +def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK14__nv_bfloat162_nbst = declare_device( + "_ZL5__ldgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__ldgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldg(*ptr); + return 0; + } + """ + + _ZL5__ldgPK13__nv_bfloat16_nbst = declare_device( + 
"_ZL5__ldgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__ldgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__ldgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcg(): + pass + + +def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, 
CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcgPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcg(*ptr); + return 0; + } + """ + + _ZL6__ldcgPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcg, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcgPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcgPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldca(): + pass + + +def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcaPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldca(*ptr); + return 0; + } + """ + + _ZL6__ldcaPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldca, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcaPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcaPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcs(): + pass + + +def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 
** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcsPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcsPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcs(*ptr); + return 0; + } + """ + + _ZL6__ldcsPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcs, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcsPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__ldcsPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldlu(): + pass + + +def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldluPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldlu, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldluPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldlu(*ptr); + return 0; + } + """ + + _ZL6__ldluPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldluPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldlu, 
CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldluPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldluPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __ldcv(): + pass + + +def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__ldcvPK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK14__nv_bfloat162_nbst = declare_device( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(CPointer(_type___nv_bfloat162))), + ) + + def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat162)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK14__nv_bfloat162_nbst_caller, + signature( + _type___nv_bfloat162, CPointer(CPointer(_type___nv_bfloat162)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL6__ldcvPK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** ptr) { + retval = __ldcv(*ptr); + return 0; + } + """ + + _ZL6__ldcvPK13__nv_bfloat16_nbst = declare_device( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(CPointer(_type___nv_bfloat16))), + ) + + def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) + + @lower(__ldcv, CPointer(_type___nv_bfloat16)) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__ldcvPK13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__ldcvPK13__nv_bfloat16_nbst_caller, + signature( + _type___nv_bfloat16, CPointer(CPointer(_type___nv_bfloat16)) + ), + ptrs, + ) + + +_lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __stwb(): + pass + + +def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwbP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwbP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwb(*ptr, *value); + return 0; + } + """ + + _ZL6__stwbP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwbP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwbP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwbP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcg(): + pass + + +def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + 
__stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcgP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcgP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcg(*ptr, *value); + return 0; + } + """ + + _ZL6__stcgP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcgP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcgP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, 
ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcgP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stcs(): + pass + + +def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + _ZL6__stcsP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stcsP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stcsP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stcs(*ptr, *value); + return 0; + } + """ + + 
_ZL6__stcsP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stcsP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stcsP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stcsP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __stwt(): + pass + + +def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP14__nv_bfloat162S__nbst(int &retval , __nv_bfloat162 ** ptr, __nv_bfloat162* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP14__nv_bfloat162S__nbst = declare_device( + "_ZL6__stwtP14__nv_bfloat162S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, 
sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP14__nv_bfloat162S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__stwtP13__nv_bfloat16S__nbst(int &retval , __nv_bfloat16 ** ptr, __nv_bfloat16* value) { + __stwt(*ptr, *value); + return 0; + } + """ + + _ZL6__stwtP13__nv_bfloat16S__nbst = declare_device( + "_ZL6__stwtP13__nv_bfloat16S__nbst", + void( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__stwtP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__stwtP13__nv_bfloat16S__nbst_caller, + signature( + void, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __heq2(): + pass + + +def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__heq214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __heq2(*a, *b); + return 0; + } + """ + + _ZL6__heq214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__heq214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__heq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__heq214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2(): + pass + + +def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hne214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2(*a, *b); + return 0; + } + """ + + _ZL6__hne214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hne214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hne214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2(): + pass + + +def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hle214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2(*a, *b); + return 0; + } + """ + + _ZL6__hle214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hle214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hle214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hle214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2(): + pass + + +def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hge214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2(*a, *b); + return 0; + } + """ + + _ZL6__hge214__nv_bfloat162S__nbst = 
declare_device( + "_ZL6__hge214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hge214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2(): + pass + + +def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hlt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2(*a, *b); + return 0; + } + """ + + _ZL6__hlt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hlt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hlt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, 
ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hlt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2(): + pass + + +def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgt214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2(*a, *b); + return 0; + } + """ + + _ZL6__hgt214__nv_bfloat162S__nbst = declare_device( + "_ZL6__hgt214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgt214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hequ2(): + pass + + +def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hequ214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2(*a, *b); + return 0; + } + """ + + _ZL7__hequ214__nv_bfloat162S__nbst = 
declare_device( + "_ZL7__hequ214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hequ214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2(): + pass + + +def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2(*a, *b); + return 0; + } + """ + + _ZL7__hneu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hneu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2(): + pass + + +def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hleu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2(*a, *b); + return 0; + } + """ + + _ZL7__hleu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hleu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hleu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2(): + pass + + +def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgeu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2(*a, *b); + return 0; + } + """ + + 
_ZL7__hgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgeu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgeu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2(): + pass + + +def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hltu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2(*a, *b); + return 0; + } + """ + + _ZL7__hltu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hltu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hltu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2(): + pass + + +def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hgtu214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2(*a, *b); + return 0; + } + """ + + _ZL7__hgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hgtu214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hgtu214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq2_mask(): + pass + + +def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__heq2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = 
__heq2_mask(*a, *b); + return 0; + } + """ + + _ZL11__heq2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__heq2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hne2_mask(): + pass + + +def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hne2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hne2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hne2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hne2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hle2_mask(): + pass + + +def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hle2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hle2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hle2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hle2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hge2_mask(): + pass + + +def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + 
_ZL11__hge2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hge2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hge2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hge2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hlt2_mask(): + pass + + +def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hlt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hlt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hlt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hlt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgt2_mask(): + pass + + +def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hgt2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgt2_mask(*a, *b); + return 0; + } + """ + + _ZL11__hgt2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hgt2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def 
__hequ2_mask(): + pass + + +def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hequ2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hequ2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hequ2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hequ2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hneu2_mask(): + pass + + +def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hneu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hneu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hneu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return 
_ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hneu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hleu2_mask(): + pass + + +def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hleu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hleu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hleu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hleu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, 
+ CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgeu2_mask(): + pass + + +def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgeu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgeu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hltu2_mask(): + pass + + +def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hltu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hltu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hltu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", + 
uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hltu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hgtu2_mask(): + pass + + +def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(unsigned int &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hgtu2_mask(*a, *b); + return 0; + } + """ + + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst = declare_device( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", + uint32(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hgtu2_mask14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller, + signature( + uint32, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hisnan2(): + pass + + +def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hisnan214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hisnan2(*a); + return 0; + } + """ + + _ZL9__hisnan214__nv_bfloat162_nbst = declare_device( + "_ZL9__hisnan214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): + return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) + + @lower(__hisnan2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hisnan214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hisnan214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2(): + pass + + +def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hadd214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2(*a, *b); + return 0; + } + """ + + _ZL7__hadd214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hadd214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), 
CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hadd214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hadd214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2(): + pass + + +def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hsub214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2(*a, *b); + return 0; + } + """ + + _ZL7__hsub214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hsub214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hsub214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL7__hsub214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2(): + pass + + +def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmul214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2(*a, *b); + return 0; + } + """ + + _ZL7__hmul214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmul214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmul214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmul214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hadd2_rn(): + pass + + +def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hadd2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", 
+ _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_rn(): + pass + + +def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hsub2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_rn(): + pass + + +def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul2_rn14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_rn(*a, *b); + return 0; + } + """ + + _ZL10__hmul2_rn14__nv_bfloat162S__nbst = declare_device( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul2_rn14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __h2div(): + pass + + +def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__h2div14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + 
retval = __h2div(*a, *b); + return 0; + } + """ + + _ZL7__h2div14__nv_bfloat162S__nbst = declare_device( + "_ZL7__h2div14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__h2div14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__h2div14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __habs2(): + pass + + +def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__habs214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __habs2(*a); + return 0; + } + """ + + _ZL7__habs214__nv_bfloat162_nbst = declare_device( + "_ZL7__habs214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__habs214__nv_bfloat162_nbst(arg_0) + + @lower(__habs2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__habs214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__habs214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __hadd2_sat(): + pass + + +def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hadd2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hadd2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hadd2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hadd2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hsub2_sat(): + pass + + +def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hsub2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hsub2_sat(*a, *b); + return 0; + } + """ + + 
_ZL11__hsub2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hsub2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmul2_sat(): + pass + + +def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmul2_sat14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmul2_sat(*a, *b); + return 0; + } + """ + + _ZL11__hmul2_sat14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmul2_sat14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2(): + pass + + +def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hfma214__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2(*a, *b, *c); + return 0; + } + """ + + _ZL7__hfma214__nv_bfloat162S_S__nbst = declare_device( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hfma214__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hfma214__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hfma2_sat(): 
+ pass + + +def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_sat(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst = declare_device( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_sat, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma2_sat14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hneg2(): + pass + + +def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hneg214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = __hneg2(*a); + return 0; + } + """ + + _ZL7__hneg214__nv_bfloat162_nbst = declare_device( + "_ZL7__hneg214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): + return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) + + @lower(__hneg2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hneg214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hneg214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def __habs(): + pass + + +def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__habs13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __habs(*a); + return 0; + } + """ + + _ZL6__habs13__nv_bfloat16_nbst = declare_device( + "_ZL6__habs13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__habs13__nv_bfloat16_nbst(arg_0) + + @lower(__habs, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__habs13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__habs13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hadd(): + pass + + +def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hadd13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd(*a, *b); + return 0; + } + """ + + _ZL6__hadd13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hadd13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hadd13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hadd13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub(): + pass + + +def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hsub13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub(*a, *b); + return 0; + } + """ + + _ZL6__hsub13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hsub13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL6__hsub13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hsub13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul(): + pass + + +def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmul13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul(*a, *b); + return 0; + } + """ + + _ZL6__hmul13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmul13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmul13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmul13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_rn(): + pass + + +def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" 
__device__ int + _ZL9__hadd_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_rn(*a, *b); + return 0; + } + """ + + _ZL9__hadd_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hadd_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_rn(): + pass + + +def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hsub_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_rn(*a, *b); + return 0; + } + """ + + _ZL9__hsub_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL9__hsub_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmul_rn(): + pass + + +def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9__hmul_rn13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_rn(*a, *b); + return 0; + } + """ + + _ZL9__hmul_rn13__nv_bfloat16S__nbst = declare_device( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9__hmul_rn13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hdiv(): + pass + + +def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, 
shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hdiv13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hdiv(*a, *b); + return 0; + } + """ + + _ZL6__hdiv13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hdiv13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hdiv13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hdiv13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hadd_sat(): + pass + + +def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hadd_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hadd_sat(*a, *b); + return 0; + } + """ + + _ZL10__hadd_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hadd_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hsub_sat(): + pass + + +def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hsub_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hsub_sat(*a, *b); + return 0; + } + """ + + _ZL10__hsub_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hsub_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def 
__hmul_sat(): + pass + + +def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmul_sat13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmul_sat(*a, *b); + return 0; + } + """ + + _ZL10__hmul_sat13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmul_sat13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma(): + pass + + +def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hfma13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma(*a, *b, *c); + return 0; + } + """ + + _ZL6__hfma13__nv_bfloat16S_S__nbst = declare_device( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return 
_ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hfma13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hfma13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hfma_sat(): + pass + + +def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_sat(*a, *b, *c); + return 0; + } + """ + + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst = declare_device( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_sat, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hfma_sat13__nv_bfloat16S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hneg(): + pass + + +def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneg13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = __hneg(*a); + return 0; + } + """ + + _ZL6__hneg13__nv_bfloat16_nbst = declare_device( + "_ZL6__hneg13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) + + @lower(__hneg, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneg13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneg13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hbeq2(): + pass + + +def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbeq214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbeq2(*a, *b); + return 0; + } + """ + + _ZL7__hbeq214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbeq214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def 
_ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbeq214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbeq214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbne2(): + pass + + +def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbne214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbne2(*a, *b); + return 0; + } + """ + + _ZL7__hbne214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbne214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbne214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbne214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hble2(): + pass + + +def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hble214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hble2(*a, *b); + return 0; + } + """ + + _ZL7__hble214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hble214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hble214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hble214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbge2(): + pass + + +def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbge214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbge2(*a, *b); + return 0; + } + """ + + _ZL7__hbge214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbge214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, 
arg_1) + + @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbge214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbge214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hblt2(): + pass + + +def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hblt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hblt2(*a, *b); + return 0; + } + """ + + _ZL7__hblt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hblt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hblt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hblt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, 
shim_obj) + + +def __hbgt2(): + pass + + +def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hbgt214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgt2(*a, *b); + return 0; + } + """ + + _ZL7__hbgt214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hbgt214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hbgt214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hbgt214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbequ2(): + pass + + +def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbequ214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbequ2(*a, *b); + return 0; + } + """ + + _ZL8__hbequ214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbequ214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, 
sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbequ214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbequ214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbneu2(): + pass + + +def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbneu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbneu2(*a, *b); + return 0; + } + """ + + _ZL8__hbneu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbneu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbneu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbneu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbleu2(): + pass + + +def 
_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbleu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbleu2(*a, *b); + return 0; + } + """ + + _ZL8__hbleu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbleu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbleu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbleu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgeu2(): + pass + + +def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgeu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgeu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgeu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgeu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgeu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbltu2(): + pass + + +def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbltu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbltu2(*a, *b); + return 0; + } + """ + + _ZL8__hbltu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbltu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbltu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbltu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hbgtu2(): + pass + + +def 
_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hbgtu214__nv_bfloat162S__nbst(bool &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hbgtu2(*a, *b); + return 0; + } + """ + + _ZL8__hbgtu214__nv_bfloat162S__nbst = declare_device( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hbgtu214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hbgtu214__nv_bfloat162S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __heq(): + pass + + +def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__heq13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __heq(*a, *b); + return 0; + } + """ + + _ZL5__heq13__nv_bfloat16S__nbst = declare_device( + "_ZL5__heq13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZL5__heq13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__heq13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hne(): + pass + + +def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hne13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hne(*a, *b); + return 0; + } + """ + + _ZL5__hne13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hne13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hne13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hne13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hle(): + pass + + +def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hle13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, 
__nv_bfloat16* b) { + retval = __hle(*a, *b); + return 0; + } + """ + + _ZL5__hle13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hle13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hle13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hle13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hge(): + pass + + +def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hge13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hge(*a, *b); + return 0; + } + """ + + _ZL5__hge13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hge13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hge13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, 
align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hge13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hlt(): + pass + + +def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hlt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hlt(*a, *b); + return 0; + } + """ + + _ZL5__hlt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hlt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hlt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hlt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgt(): + pass + + +def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5__hgt13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgt(*a, *b); + return 0; + } + """ + + _ZL5__hgt13__nv_bfloat16S__nbst = declare_device( + "_ZL5__hgt13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + 
) + + def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5__hgt13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5__hgt13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hequ(): + pass + + +def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hequ13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hequ(*a, *b); + return 0; + } + """ + + _ZL6__hequ13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hequ13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hequ13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hequ13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + 
ptrs, + ) + + +_lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hneu(): + pass + + +def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hneu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hneu(*a, *b); + return 0; + } + """ + + _ZL6__hneu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hneu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hneu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hneu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hleu(): + pass + + +def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hleu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hleu(*a, *b); + return 0; + } + """ + + _ZL6__hleu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hleu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, 
builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hleu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hleu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgeu(): + pass + + +def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgeu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgeu(*a, *b); + return 0; + } + """ + + _ZL6__hgeu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgeu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgeu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgeu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hltu(): + pass + + +def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = 
""" + extern "C" __device__ int + _ZL6__hltu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hltu(*a, *b); + return 0; + } + """ + + _ZL6__hltu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hltu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hltu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hltu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hgtu(): + pass + + +def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hgtu13__nv_bfloat16S__nbst(bool &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hgtu(*a, *b); + return 0; + } + """ + + _ZL6__hgtu13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hgtu13__nv_bfloat16S__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hgtu13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = 
[builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hgtu13__nv_bfloat16S__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hisnan(): + pass + + +def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hisnan13__nv_bfloat16_nbst(bool &retval , __nv_bfloat16* a) { + retval = __hisnan(*a); + return 0; + } + """ + + _ZL8__hisnan13__nv_bfloat16_nbst = declare_device( + "_ZL8__hisnan13__nv_bfloat16_nbst", bool_(CPointer(_type___nv_bfloat16)) + ) + + def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): + return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) + + @lower(__hisnan, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hisnan13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hisnan13__nv_bfloat16_nbst_caller, + signature(bool_, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def __hmax(): + pass + + +def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmax13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax(*a, *b); + return 0; + } + """ + + _ZL6__hmax13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmax13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + 
CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmax13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6__hmax13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin(): + pass + + +def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6__hmin13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin(*a, *b); + return 0; + } + """ + + _ZL6__hmin13__nv_bfloat16S__nbst = declare_device( + "_ZL6__hmin13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6__hmin13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZL6__hmin13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmax_nan(): + pass + + +def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmax_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmax_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmax_nan13__nv_bfloat16S__nbst = declare_device( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmax_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hmin_nan(): + pass + + +def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL10__hmin_nan13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b) { + retval = __hmin_nan(*a, *b); + return 0; + } + """ + + _ZL10__hmin_nan13__nv_bfloat16S__nbst = declare_device( + 
"_ZL10__hmin_nan13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL10__hmin_nan13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def __hfma_relu(): + pass + + +def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(__nv_bfloat16 &retval , __nv_bfloat16* a, __nv_bfloat16* b, __nv_bfloat16* c) { + retval = __hfma_relu(*a, *b, *c); + return 0; + } + """ + + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst = declare_device( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma_relu, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hfma_relu13__nv_bfloat16S_S__nbst", 
shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj) + + +def __hmax2(): + pass + + +def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7__hmax214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2(*a, *b); + return 0; + } + """ + + _ZL7__hmax214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmax214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmax214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmax214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2(): + pass + + +def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + 
extern "C" __device__ int + _ZL7__hmin214__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2(*a, *b); + return 0; + } + """ + + _ZL7__hmin214__nv_bfloat162S__nbst = declare_device( + "_ZL7__hmin214__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7__hmin214__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7__hmin214__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmax2_nan(): + pass + + +def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmax2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmax2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmax2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmax2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hmin2_nan(): + pass + + +def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL11__hmin2_nan14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b) { + retval = __hmin2_nan(*a, *b); + return 0; + } + """ + + _ZL11__hmin2_nan14__nv_bfloat162S__nbst = declare_device( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL11__hmin2_nan14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + 
+_lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def __hfma2_relu(): + pass + + +def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hfma2_relu(*a, *b, *c); + return 0; + } + """ + + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst = declare_device( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hfma2_relu, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def __hcmadd(): + pass + + +def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL8__hcmadd14__nv_bfloat162S_S__nbst(__nv_bfloat162 &retval , __nv_bfloat162* a, __nv_bfloat162* b, __nv_bfloat162* c) { + retval = __hcmadd(*a, *b, *c); + return 0; + } + """ + + 
_ZL8__hcmadd14__nv_bfloat162S_S__nbst = declare_device( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): + return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + + @lower( + __hcmadd, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL8__hcmadd14__nv_bfloat162S_S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj) + + +def hsqrt(): + pass + + +def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsqrt(*a); + return 0; + } + """ + + _ZL5hsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL5hsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg 
in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrsqrt(): + pass + + +def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hrsqrt13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrsqrt(*a); + return 0; + } + """ + + _ZL6hrsqrt13__nv_bfloat16_nbst = declare_device( + "_ZL6hrsqrt13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) + + @lower(hrsqrt, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hrsqrt13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hrsqrt13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hrcp(): + pass + + +def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hrcp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hrcp(*a); + return 0; + } + """ + + _ZL4hrcp13__nv_bfloat16_nbst = declare_device( + "_ZL4hrcp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) + + @lower(hrcp, 
_type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hrcp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog(): + pass + + +def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hlog13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog(*a); + return 0; + } + """ + + _ZL4hlog13__nv_bfloat16_nbst = declare_device( + "_ZL4hlog13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hlog13__nv_bfloat16_nbst(arg_0) + + @lower(hlog, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hlog13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog2(): + pass + + +def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hlog213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog2(*a); + return 0; + } + """ 
+ + _ZL5hlog213__nv_bfloat16_nbst = declare_device( + "_ZL5hlog213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hlog213__nv_bfloat16_nbst(arg_0) + + @lower(hlog2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hlog213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hlog213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hlog10(): + pass + + +def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hlog1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hlog10(*a); + return 0; + } + """ + + _ZL6hlog1013__nv_bfloat16_nbst = declare_device( + "_ZL6hlog1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) + + @lower(hlog10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hlog1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hlog1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + 
+_lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp(): + pass + + +def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hexp13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp(*a); + return 0; + } + """ + + _ZL4hexp13__nv_bfloat16_nbst = declare_device( + "_ZL4hexp13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hexp13__nv_bfloat16_nbst(arg_0) + + @lower(hexp, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hexp13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def htanh_approx(): + pass + + +def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL12htanh_approx13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh_approx(*a); + return 0; + } + """ + + _ZL12htanh_approx13__nv_bfloat16_nbst = declare_device( + "_ZL12htanh_approx13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): + return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) + + @lower(htanh_approx, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL12htanh_approx13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs 
= [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL12htanh_approx13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh_approx(): + pass + + +def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL13h2tanh_approx14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh_approx(*a); + return 0; + } + """ + + _ZL13h2tanh_approx14__nv_bfloat162_nbst = declare_device( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): + return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh_approx, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL13h2tanh_approx14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def htanh(): + pass + + +def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5htanh13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = htanh(*a); + return 0; + } + """ + + _ZL5htanh13__nv_bfloat16_nbst = 
declare_device( + "_ZL5htanh13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): + return _ZL5htanh13__nv_bfloat16_nbst(arg_0) + + @lower(htanh, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5htanh13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5htanh13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2tanh(): + pass + + +def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2tanh14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2tanh(*a); + return 0; + } + """ + + _ZL6h2tanh14__nv_bfloat162_nbst = declare_device( + "_ZL6h2tanh14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) + + @lower(h2tanh, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2tanh14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2tanh14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + 
+_lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def hexp2(): + pass + + +def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5hexp213__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp2(*a); + return 0; + } + """ + + _ZL5hexp213__nv_bfloat16_nbst = declare_device( + "_ZL5hexp213__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): + return _ZL5hexp213__nv_bfloat16_nbst(arg_0) + + @lower(hexp2, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5hexp213__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5hexp213__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hexp10(): + pass + + +def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6hexp1013__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hexp10(*a); + return 0; + } + """ + + _ZL6hexp1013__nv_bfloat16_nbst = declare_device( + "_ZL6hexp1013__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): + return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) + + @lower(hexp10, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6hexp1013__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) 
for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6hexp1013__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hcos(): + pass + + +def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hcos13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hcos(*a); + return 0; + } + """ + + _ZL4hcos13__nv_bfloat16_nbst = declare_device( + "_ZL4hcos13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hcos13__nv_bfloat16_nbst(arg_0) + + @lower(hcos, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hcos13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def hsin(): + pass + + +def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL4hsin13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* a) { + retval = hsin(*a); + return 0; + } + """ + + _ZL4hsin13__nv_bfloat16_nbst = declare_device( + "_ZL4hsin13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): + return _ZL4hsin13__nv_bfloat16_nbst(arg_0) + + 
@lower(hsin, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL4hsin13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def h2sqrt(): + pass + + +def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2sqrt14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sqrt(*a); + return 0; + } + """ + + _ZL6h2sqrt14__nv_bfloat162_nbst = declare_device( + "_ZL6h2sqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2sqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2sqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2sqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rsqrt(): + pass + + +def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2rsqrt14__nv_bfloat162_nbst(__nv_bfloat162 
&retval , __nv_bfloat162* a) { + retval = h2rsqrt(*a); + return 0; + } + """ + + _ZL7h2rsqrt14__nv_bfloat162_nbst = declare_device( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) + + @lower(h2rsqrt, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2rsqrt14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2rsqrt14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2rcp(): + pass + + +def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2rcp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2rcp(*a); + return 0; + } + """ + + _ZL5h2rcp14__nv_bfloat162_nbst = declare_device( + "_ZL5h2rcp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) + + @lower(h2rcp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2rcp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2rcp14__nv_bfloat162_nbst_caller, 
+ signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log(): + pass + + +def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2log14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log(*a); + return 0; + } + """ + + _ZL5h2log14__nv_bfloat162_nbst = declare_device( + "_ZL5h2log14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2log14__nv_bfloat162_nbst(arg_0) + + @lower(h2log, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2log14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2log14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log2(): + pass + + +def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2log214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log2(*a); + return 0; + } + """ + + _ZL6h2log214__nv_bfloat162_nbst = declare_device( + "_ZL6h2log214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2log214__nv_bfloat162_nbst(arg_0) + + @lower(h2log2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( 
+ "_ZL6h2log214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2log214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2log10(): + pass + + +def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2log1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2log10(*a); + return 0; + } + """ + + _ZL7h2log1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2log1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) + + @lower(h2log10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2log1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2log1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp(): + pass + + +def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2exp14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp(*a); + return 0; + } + """ + + _ZL5h2exp14__nv_bfloat162_nbst = declare_device( + 
"_ZL5h2exp14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) + + @lower(h2exp, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2exp14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2exp14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2exp2(): + pass + + +def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL6h2exp214__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp2(*a); + return 0; + } + """ + + _ZL6h2exp214__nv_bfloat162_nbst = declare_device( + "_ZL6h2exp214__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): + return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) + + @lower(h2exp2, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL6h2exp214__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL6h2exp214__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, 
shim_obj) + + +def h2exp10(): + pass + + +def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL7h2exp1014__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2exp10(*a); + return 0; + } + """ + + _ZL7h2exp1014__nv_bfloat162_nbst = declare_device( + "_ZL7h2exp1014__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): + return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) + + @lower(h2exp10, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL7h2exp1014__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL7h2exp1014__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2cos(): + pass + + +def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2cos14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2cos(*a); + return 0; + } + """ + + _ZL5h2cos14__nv_bfloat162_nbst = declare_device( + "_ZL5h2cos14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) + + @lower(h2cos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2cos14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in 
sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2cos14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def h2sin(): + pass + + +def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL5h2sin14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* a) { + retval = h2sin(*a); + return 0; + } + """ + + _ZL5h2sin14__nv_bfloat162_nbst = declare_device( + "_ZL5h2sin14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): + return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) + + @lower(h2sin, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL5h2sin14__nv_bfloat162_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL5h2sin14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def atomicAdd(): + pass + + +def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP14__nv_bfloat162S__nbst(__nv_bfloat162 &retval , __nv_bfloat162 ** address, __nv_bfloat162* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP14__nv_bfloat162S__nbst = declare_device( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", + _type___nv_bfloat162( + 
CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ) + + def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP14__nv_bfloat162S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP14__nv_bfloat162S__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(CPointer(_type___nv_bfloat162)), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj) + + +def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZL9atomicAddP13__nv_bfloat16S__nbst(__nv_bfloat16 &retval , __nv_bfloat16 ** address, __nv_bfloat16* val) { + retval = atomicAdd(*address, *val); + return 0; + } + """ + + _ZL9atomicAddP13__nv_bfloat16S__nbst = declare_device( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", + _type___nv_bfloat16( + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ) + + def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): + return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) + + @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZL9atomicAddP13__nv_bfloat16S__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZL9atomicAddP13__nv_bfloat16S__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(CPointer(_type___nv_bfloat16)), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK13__nv_bfloat16S1__nbst = declare_device( + "_ZplRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmiRK13__nv_bfloat16S1__nbst", + 
_type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK13__nv_bfloat16S1__nbst = declare_device( + "_ZmlRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK13__nv_bfloat16S1__nbst_caller, + 
signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK13__nv_bfloat16S1__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK13__nv_bfloat16S1__nbst = declare_device( + "_ZdvRK13__nv_bfloat16S1__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK13__nv_bfloat16S1__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR13__nv_bfloat16RKS__nbst = declare_device( + "_ZpLR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return 
_ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR13__nv_bfloat16RKS__nbst = declare_device( + "_ZmIR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + 
+_lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmLR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator*=(*lh, *rh); + return 0; + } + """ + + _ZmLR13__nv_bfloat16RKS__nbst = declare_device( + "_ZmLR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmLR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmLR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdVR13__nv_bfloat16RKS__nbst(__nv_bfloat16 &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator/=(*lh, *rh); + return 0; + } + """ + + _ZdVR13__nv_bfloat16RKS__nbst = declare_device( + "_ZdVR13__nv_bfloat16RKS__nbst", + _type___nv_bfloat16( + CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16) + ), + ) + + def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): + return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) + + @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, 
sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdVR13__nv_bfloat16RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdVR13__nv_bfloat16RKS__nbst_caller, + signature( + _type___nv_bfloat16, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpsRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = operator+(*h); + return 0; + } + """ + + _ZpsRK13__nv_bfloat16_nbst = declare_device( + "_ZpsRK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): + return _ZpsRK13__nv_bfloat16_nbst(arg_0) + + @lower(operator.pos, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpsRK13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZngRK13__nv_bfloat16_nbst(__nv_bfloat16 &retval , __nv_bfloat16* h) { + retval = operator-(*h); + return 0; + } + """ + + _ZngRK13__nv_bfloat16_nbst = declare_device( + 
"_ZngRK13__nv_bfloat16_nbst", + _type___nv_bfloat16(CPointer(_type___nv_bfloat16)), + ) + + def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): + return _ZngRK13__nv_bfloat16_nbst(arg_0) + + @lower(operator.neg, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZngRK13__nv_bfloat16_nbst_caller, + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZeqRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator==(*lh, *rh); + return 0; + } + """ + + _ZeqRK13__nv_bfloat16S1__nbst = declare_device( + "_ZeqRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZeqRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZeqRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + 
+_lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK13__nv_bfloat16S1__nbst = declare_device( + "_ZneRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgtRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + 
shim_stream.write_with_key( + "_ZgtRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK13__nv_bfloat16S1__nbst = declare_device( + "_ZltRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator>=(*lh, *rh); + return 
0; + } + """ + + _ZgeRK13__nv_bfloat16S1__nbst = declare_device( + "_ZgeRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK13__nv_bfloat16S1__nbst(bool &retval , __nv_bfloat16* lh, __nv_bfloat16* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK13__nv_bfloat16S1__nbst = declare_device( + "_ZleRK13__nv_bfloat16S1__nbst", + bool_(CPointer(_type___nv_bfloat16), CPointer(_type___nv_bfloat16)), + ) + + def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): + return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK13__nv_bfloat16S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + 
_ZleRK13__nv_bfloat16S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat16), + CPointer(_type___nv_bfloat16), + ), + ptrs, + ) + + +_lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj) + + +def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZplRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+(*lh, *rh); + return 0; + } + """ + + _ZplRK14__nv_bfloat162S1__nbst = declare_device( + "_ZplRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZplRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZplRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmiRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-(*lh, *rh); + return 0; + } + """ + + _ZmiRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmiRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def 
_ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmiRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmiRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmlRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*(*lh, *rh); + return 0; + } + """ + + _ZmlRK14__nv_bfloat162S1__nbst = declare_device( + "_ZmlRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmlRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmlRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + 
CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdvRK14__nv_bfloat162S1__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/(*lh, *rh); + return 0; + } + """ + + _ZdvRK14__nv_bfloat162S1__nbst = declare_device( + "_ZdvRK14__nv_bfloat162S1__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdvRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdvRK14__nv_bfloat162S1__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator+=(*lh, *rh); + return 0; + } + """ + + _ZpLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZpLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + 
@lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZpLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmIR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator-=(*lh, *rh); + return 0; + } + """ + + _ZmIR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmIR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmIR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmIR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, 
shim_obj) + + +def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZmLR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator*=(*lh, *rh); + return 0; + } + """ + + _ZmLR14__nv_bfloat162RKS__nbst = declare_device( + "_ZmLR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZmLR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZmLR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZdVR14__nv_bfloat162RKS__nbst(__nv_bfloat162 &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator/=(*lh, *rh); + return 0; + } + """ + + _ZdVR14__nv_bfloat162RKS__nbst = declare_device( + "_ZdVR14__nv_bfloat162RKS__nbst", + _type___nv_bfloat162( + CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162) + ), + ) + + def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): + return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) + + @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZdVR14__nv_bfloat162RKS__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZdVR14__nv_bfloat162RKS__nbst_caller, + signature( + _type___nv_bfloat162, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj) + + +def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZpsRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator+(*h); + return 0; + } + """ + + _ZpsRK14__nv_bfloat162_nbst = declare_device( + "_ZpsRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZpsRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.pos, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZpsRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZngRK14__nv_bfloat162_nbst(__nv_bfloat162 &retval , __nv_bfloat162* h) { + retval = operator-(*h); + return 0; + } + """ + + _ZngRK14__nv_bfloat162_nbst = 
declare_device( + "_ZngRK14__nv_bfloat162_nbst", + _type___nv_bfloat162(CPointer(_type___nv_bfloat162)), + ) + + def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): + return _ZngRK14__nv_bfloat162_nbst(arg_0) + + @lower(operator.neg, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZngRK14__nv_bfloat162_nbst_caller, + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + ptrs, + ) + + +_lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj) + + +def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZeqRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator==(*lh, *rh); + return 0; + } + """ + + _ZeqRK14__nv_bfloat162S1__nbst = declare_device( + "_ZeqRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZeqRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZeqRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), 
+ ), + ptrs, + ) + + +_lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZneRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator!=(*lh, *rh); + return 0; + } + """ + + _ZneRK14__nv_bfloat162S1__nbst = declare_device( + "_ZneRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZneRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZneRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgtRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>(*lh, *rh); + return 0; + } + """ + + _ZgtRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgtRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + 
context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgtRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgtRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZltRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<(*lh, *rh); + return 0; + } + """ + + _ZltRK14__nv_bfloat162S1__nbst = declare_device( + "_ZltRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZltRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZltRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZgeRK14__nv_bfloat162S1__nbst(bool &retval , 
__nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator>=(*lh, *rh); + return 0; + } + """ + + _ZgeRK14__nv_bfloat162S1__nbst = declare_device( + "_ZgeRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZgeRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZgeRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZleRK14__nv_bfloat162S1__nbst(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { + retval = operator<=(*lh, *rh); + return 0; + } + """ + + _ZleRK14__nv_bfloat162S1__nbst = declare_device( + "_ZleRK14__nv_bfloat162S1__nbst", + bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), + ) + + def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): + return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) + + @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZleRK14__nv_bfloat162S1__nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + 
builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZleRK14__nv_bfloat162S1__nbst_caller, + signature( + bool_, + CPointer(_type___nv_bfloat162), + CPointer(_type___nv_bfloat162), + ), + ptrs, + ) + + +_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) + + +def __half(): + pass + + +def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZN6__halfC1E13__nv_bfloat16_nbst(int &retval , __nv_bfloat16* f) { + __half(*f); + return 0; + } + """ + + _ZN6__halfC1E13__nv_bfloat16_nbst = declare_device( + "_ZN6__halfC1E13__nv_bfloat16_nbst", void(CPointer(_type___nv_bfloat16)) + ) + + def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): + return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) + + @lower(__half, _type___nv_bfloat16) + def impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key( + "_ZN6__halfC1E13__nv_bfloat16_nbst", shim_raw_str + ) + ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] + for ptr, ty, arg in zip(ptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) + + return context.compile_internal( + builder, + _ZN6__halfC1E13__nv_bfloat16_nbst_caller, + signature(void, CPointer(_type___nv_bfloat16)), + ptrs, + ) + + +_lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj) + + +@register +class _typing___double2bfloat16(ConcreteTemplate): + key = globals()["__double2bfloat16"] + cases = [signature(_type___nv_bfloat16, float64)] + + +register_global(__double2bfloat16, types.Function(_typing___double2bfloat16)) + + +@register +class _typing___float2bfloat16(ConcreteTemplate): + key = globals()["__float2bfloat16"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global(__float2bfloat16, types.Function(_typing___float2bfloat16)) + + +@register +class 
_typing___float2bfloat16_rn(ConcreteTemplate): + key = globals()["__float2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rn, types.Function(_typing___float2bfloat16_rn) +) + + +@register +class _typing___float2bfloat16_rz(ConcreteTemplate): + key = globals()["__float2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rz, types.Function(_typing___float2bfloat16_rz) +) + + +@register +class _typing___float2bfloat16_rd(ConcreteTemplate): + key = globals()["__float2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_rd, types.Function(_typing___float2bfloat16_rd) +) + + +@register +class _typing___float2bfloat16_ru(ConcreteTemplate): + key = globals()["__float2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, float32)] + + +register_global( + __float2bfloat16_ru, types.Function(_typing___float2bfloat16_ru) +) + + +@register +class _typing___bfloat162float(ConcreteTemplate): + key = globals()["__bfloat162float"] + cases = [signature(float32, _type___nv_bfloat16)] + + +register_global(__bfloat162float, types.Function(_typing___bfloat162float)) + + +@register +class _typing___float2bfloat162_rn(ConcreteTemplate): + key = globals()["__float2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32)] + + +register_global( + __float2bfloat162_rn, types.Function(_typing___float2bfloat162_rn) +) + + +@register +class _typing___floats2bfloat162_rn(ConcreteTemplate): + key = globals()["__floats2bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32, float32)] + + +register_global( + __floats2bfloat162_rn, types.Function(_typing___floats2bfloat162_rn) +) + + +@register +class _typing___low2float(ConcreteTemplate): + key = globals()["__low2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__low2float, types.Function(_typing___low2float)) + + +@register 
+class _typing___high2float(ConcreteTemplate): + key = globals()["__high2float"] + cases = [signature(float32, _type___nv_bfloat162)] + + +register_global(__high2float, types.Function(_typing___high2float)) + + +@register +class _typing___float22bfloat162_rn(ConcreteTemplate): + key = globals()["__float22bfloat162_rn"] + cases = [signature(_type___nv_bfloat162, float32x2)] + + +register_global( + __float22bfloat162_rn, types.Function(_typing___float22bfloat162_rn) +) + + +@register +class _typing___bfloat1622float2(ConcreteTemplate): + key = globals()["__bfloat1622float2"] + cases = [signature(float32x2, _type___nv_bfloat162)] + + +register_global(__bfloat1622float2, types.Function(_typing___bfloat1622float2)) + + +@register +class _typing___bfloat162char_rz(ConcreteTemplate): + key = globals()["__bfloat162char_rz"] + cases = [signature(int8, _type___nv_bfloat16)] + + +register_global(__bfloat162char_rz, types.Function(_typing___bfloat162char_rz)) + + +@register +class _typing___bfloat162uchar_rz(ConcreteTemplate): + key = globals()["__bfloat162uchar_rz"] + cases = [signature(uint8, _type___nv_bfloat16)] + + +register_global( + __bfloat162uchar_rz, types.Function(_typing___bfloat162uchar_rz) +) + + +@register +class _typing___bfloat162int_rn(ConcreteTemplate): + key = globals()["__bfloat162int_rn"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rn, types.Function(_typing___bfloat162int_rn)) + + +@register +class _typing___bfloat162int_rz(ConcreteTemplate): + key = globals()["__bfloat162int_rz"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rz, types.Function(_typing___bfloat162int_rz)) + + +@register +class _typing___bfloat162int_rd(ConcreteTemplate): + key = globals()["__bfloat162int_rd"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_rd, types.Function(_typing___bfloat162int_rd)) + + +@register +class _typing___bfloat162int_ru(ConcreteTemplate): + 
key = globals()["__bfloat162int_ru"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__bfloat162int_ru, types.Function(_typing___bfloat162int_ru)) + + +@register +class _typing___int2bfloat16_rn(ConcreteTemplate): + key = globals()["__int2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rn, types.Function(_typing___int2bfloat16_rn)) + + +@register +class _typing___int2bfloat16_rz(ConcreteTemplate): + key = globals()["__int2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rz, types.Function(_typing___int2bfloat16_rz)) + + +@register +class _typing___int2bfloat16_rd(ConcreteTemplate): + key = globals()["__int2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_rd, types.Function(_typing___int2bfloat16_rd)) + + +@register +class _typing___int2bfloat16_ru(ConcreteTemplate): + key = globals()["__int2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int32)] + + +register_global(__int2bfloat16_ru, types.Function(_typing___int2bfloat16_ru)) + + +@register +class _typing___bfloat162short_rn(ConcreteTemplate): + key = globals()["__bfloat162short_rn"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rn, types.Function(_typing___bfloat162short_rn) +) + + +@register +class _typing___bfloat162short_rz(ConcreteTemplate): + key = globals()["__bfloat162short_rz"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rz, types.Function(_typing___bfloat162short_rz) +) + + +@register +class _typing___bfloat162short_rd(ConcreteTemplate): + key = globals()["__bfloat162short_rd"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_rd, types.Function(_typing___bfloat162short_rd) +) + + +@register +class _typing___bfloat162short_ru(ConcreteTemplate): + key = globals()["__bfloat162short_ru"] + cases 
= [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat162short_ru, types.Function(_typing___bfloat162short_ru) +) + + +@register +class _typing___short2bfloat16_rn(ConcreteTemplate): + key = globals()["__short2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rn, types.Function(_typing___short2bfloat16_rn) +) + + +@register +class _typing___short2bfloat16_rz(ConcreteTemplate): + key = globals()["__short2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rz, types.Function(_typing___short2bfloat16_rz) +) + + +@register +class _typing___short2bfloat16_rd(ConcreteTemplate): + key = globals()["__short2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_rd, types.Function(_typing___short2bfloat16_rd) +) + + +@register +class _typing___short2bfloat16_ru(ConcreteTemplate): + key = globals()["__short2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short2bfloat16_ru, types.Function(_typing___short2bfloat16_ru) +) + + +@register +class _typing___bfloat162uint_rn(ConcreteTemplate): + key = globals()["__bfloat162uint_rn"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rn, types.Function(_typing___bfloat162uint_rn)) + + +@register +class _typing___bfloat162uint_rz(ConcreteTemplate): + key = globals()["__bfloat162uint_rz"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rz, types.Function(_typing___bfloat162uint_rz)) + + +@register +class _typing___bfloat162uint_rd(ConcreteTemplate): + key = globals()["__bfloat162uint_rd"] + cases = [signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_rd, types.Function(_typing___bfloat162uint_rd)) + + +@register +class _typing___bfloat162uint_ru(ConcreteTemplate): + key = globals()["__bfloat162uint_ru"] + cases = 
[signature(uint32, _type___nv_bfloat16)] + + +register_global(__bfloat162uint_ru, types.Function(_typing___bfloat162uint_ru)) + + +@register +class _typing___uint2bfloat16_rn(ConcreteTemplate): + key = globals()["__uint2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rn, types.Function(_typing___uint2bfloat16_rn)) + + +@register +class _typing___uint2bfloat16_rz(ConcreteTemplate): + key = globals()["__uint2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rz, types.Function(_typing___uint2bfloat16_rz)) + + +@register +class _typing___uint2bfloat16_rd(ConcreteTemplate): + key = globals()["__uint2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_rd, types.Function(_typing___uint2bfloat16_rd)) + + +@register +class _typing___uint2bfloat16_ru(ConcreteTemplate): + key = globals()["__uint2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint32)] + + +register_global(__uint2bfloat16_ru, types.Function(_typing___uint2bfloat16_ru)) + + +@register +class _typing___bfloat162ushort_rn(ConcreteTemplate): + key = globals()["__bfloat162ushort_rn"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rn, types.Function(_typing___bfloat162ushort_rn) +) + + +@register +class _typing___bfloat162ushort_rz(ConcreteTemplate): + key = globals()["__bfloat162ushort_rz"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rz, types.Function(_typing___bfloat162ushort_rz) +) + + +@register +class _typing___bfloat162ushort_rd(ConcreteTemplate): + key = globals()["__bfloat162ushort_rd"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_rd, types.Function(_typing___bfloat162ushort_rd) +) + + +@register +class _typing___bfloat162ushort_ru(ConcreteTemplate): + key = globals()["__bfloat162ushort_ru"] + cases = 
[signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat162ushort_ru, types.Function(_typing___bfloat162ushort_ru) +) + + +@register +class _typing___ushort2bfloat16_rn(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rn, types.Function(_typing___ushort2bfloat16_rn) +) + + +@register +class _typing___ushort2bfloat16_rz(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rz, types.Function(_typing___ushort2bfloat16_rz) +) + + +@register +class _typing___ushort2bfloat16_rd(ConcreteTemplate): + key = globals()["__ushort2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_rd, types.Function(_typing___ushort2bfloat16_rd) +) + + +@register +class _typing___ushort2bfloat16_ru(ConcreteTemplate): + key = globals()["__ushort2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort2bfloat16_ru, types.Function(_typing___ushort2bfloat16_ru) +) + + +@register +class _typing___bfloat162ull_rn(ConcreteTemplate): + key = globals()["__bfloat162ull_rn"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rn, types.Function(_typing___bfloat162ull_rn)) + + +@register +class _typing___bfloat162ull_rz(ConcreteTemplate): + key = globals()["__bfloat162ull_rz"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rz, types.Function(_typing___bfloat162ull_rz)) + + +@register +class _typing_make_bfloat162(ConcreteTemplate): + key = globals()["make_bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) + + +@register +class _typing___bfloat162ull_rd(ConcreteTemplate): + key = 
globals()["__bfloat162ull_rd"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_rd, types.Function(_typing___bfloat162ull_rd)) + + +@register +class _typing___bfloat162ull_ru(ConcreteTemplate): + key = globals()["__bfloat162ull_ru"] + cases = [signature(uint64, _type___nv_bfloat16)] + + +register_global(__bfloat162ull_ru, types.Function(_typing___bfloat162ull_ru)) + + +@register +class _typing___ull2bfloat16_rn(ConcreteTemplate): + key = globals()["__ull2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rn, types.Function(_typing___ull2bfloat16_rn)) + + +@register +class _typing___ull2bfloat16_rz(ConcreteTemplate): + key = globals()["__ull2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rz, types.Function(_typing___ull2bfloat16_rz)) + + +@register +class _typing___ull2bfloat16_rd(ConcreteTemplate): + key = globals()["__ull2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_rd, types.Function(_typing___ull2bfloat16_rd)) + + +@register +class _typing___ull2bfloat16_ru(ConcreteTemplate): + key = globals()["__ull2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, uint64)] + + +register_global(__ull2bfloat16_ru, types.Function(_typing___ull2bfloat16_ru)) + + +@register +class _typing___bfloat162ll_rn(ConcreteTemplate): + key = globals()["__bfloat162ll_rn"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rn, types.Function(_typing___bfloat162ll_rn)) + + +@register +class _typing___bfloat162ll_rz(ConcreteTemplate): + key = globals()["__bfloat162ll_rz"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_rz, types.Function(_typing___bfloat162ll_rz)) + + +@register +class _typing___bfloat162ll_rd(ConcreteTemplate): + key = globals()["__bfloat162ll_rd"] + cases = [signature(int64, _type___nv_bfloat16)] + + 
+register_global(__bfloat162ll_rd, types.Function(_typing___bfloat162ll_rd)) + + +@register +class _typing___bfloat162ll_ru(ConcreteTemplate): + key = globals()["__bfloat162ll_ru"] + cases = [signature(int64, _type___nv_bfloat16)] + + +register_global(__bfloat162ll_ru, types.Function(_typing___bfloat162ll_ru)) + + +@register +class _typing___ll2bfloat16_rn(ConcreteTemplate): + key = globals()["__ll2bfloat16_rn"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rn, types.Function(_typing___ll2bfloat16_rn)) + + +@register +class _typing___ll2bfloat16_rz(ConcreteTemplate): + key = globals()["__ll2bfloat16_rz"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rz, types.Function(_typing___ll2bfloat16_rz)) + + +@register +class _typing___ll2bfloat16_rd(ConcreteTemplate): + key = globals()["__ll2bfloat16_rd"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_rd, types.Function(_typing___ll2bfloat16_rd)) + + +@register +class _typing___ll2bfloat16_ru(ConcreteTemplate): + key = globals()["__ll2bfloat16_ru"] + cases = [signature(_type___nv_bfloat16, int64)] + + +register_global(__ll2bfloat16_ru, types.Function(_typing___ll2bfloat16_ru)) + + +@register +class _typing_htrunc(ConcreteTemplate): + key = globals()["htrunc"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(htrunc, types.Function(_typing_htrunc)) + + +@register +class _typing_hceil(ConcreteTemplate): + key = globals()["hceil"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hceil, types.Function(_typing_hceil)) + + +@register +class _typing_hfloor(ConcreteTemplate): + key = globals()["hfloor"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(hfloor, types.Function(_typing_hfloor)) + + +@register +class _typing_hrint(ConcreteTemplate): + key = globals()["hrint"] + cases = [signature(_type___nv_bfloat16, 
_type___nv_bfloat16)] + + +register_global(hrint, types.Function(_typing_hrint)) + + +@register +class _typing_h2trunc(ConcreteTemplate): + key = globals()["h2trunc"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2trunc, types.Function(_typing_h2trunc)) + + +@register +class _typing_h2ceil(ConcreteTemplate): + key = globals()["h2ceil"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2ceil, types.Function(_typing_h2ceil)) + + +@register +class _typing_h2floor(ConcreteTemplate): + key = globals()["h2floor"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2floor, types.Function(_typing_h2floor)) + + +@register +class _typing_h2rint(ConcreteTemplate): + key = globals()["h2rint"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(h2rint, types.Function(_typing_h2rint)) + + +@register +class _typing___bfloat162bfloat162(ConcreteTemplate): + key = globals()["__bfloat162bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat16)] + + +register_global( + __bfloat162bfloat162, types.Function(_typing___bfloat162bfloat162) +) + + +@register +class _typing___lowhigh2highlow(ConcreteTemplate): + key = globals()["__lowhigh2highlow"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__lowhigh2highlow, types.Function(_typing___lowhigh2highlow)) + + +@register +class _typing___lows2bfloat162(ConcreteTemplate): + key = globals()["__lows2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__lows2bfloat162, types.Function(_typing___lows2bfloat162)) + + +@register +class _typing___highs2bfloat162(ConcreteTemplate): + key = globals()["__highs2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__highs2bfloat162, 
types.Function(_typing___highs2bfloat162)) + + +@register +class _typing___high2bfloat16(ConcreteTemplate): + key = globals()["__high2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__high2bfloat16, types.Function(_typing___high2bfloat16)) + + +@register +class _typing___low2bfloat16(ConcreteTemplate): + key = globals()["__low2bfloat16"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat162)] + + +register_global(__low2bfloat16, types.Function(_typing___low2bfloat16)) + + +@register +class _typing___hisinf(ConcreteTemplate): + key = globals()["__hisinf"] + cases = [signature(int32, _type___nv_bfloat16)] + + +register_global(__hisinf, types.Function(_typing___hisinf)) + + +@register +class _typing___halves2bfloat162(ConcreteTemplate): + key = globals()["__halves2bfloat162"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 + ) + ] + + +register_global(__halves2bfloat162, types.Function(_typing___halves2bfloat162)) + + +@register +class _typing___low2bfloat162(ConcreteTemplate): + key = globals()["__low2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__low2bfloat162, types.Function(_typing___low2bfloat162)) + + +@register +class _typing___high2bfloat162(ConcreteTemplate): + key = globals()["__high2bfloat162"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__high2bfloat162, types.Function(_typing___high2bfloat162)) + + +@register +class _typing___bfloat16_as_short(ConcreteTemplate): + key = globals()["__bfloat16_as_short"] + cases = [signature(int16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_short, types.Function(_typing___bfloat16_as_short) +) + + +@register +class _typing___bfloat16_as_ushort(ConcreteTemplate): + key = globals()["__bfloat16_as_ushort"] + cases = [signature(uint16, _type___nv_bfloat16)] + + +register_global( + __bfloat16_as_ushort, 
types.Function(_typing___bfloat16_as_ushort) +) + + +@register +class _typing___short_as_bfloat16(ConcreteTemplate): + key = globals()["__short_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, int16)] + + +register_global( + __short_as_bfloat16, types.Function(_typing___short_as_bfloat16) +) + + +@register +class _typing___ushort_as_bfloat16(ConcreteTemplate): + key = globals()["__ushort_as_bfloat16"] + cases = [signature(_type___nv_bfloat16, uint16)] + + +register_global( + __ushort_as_bfloat16, types.Function(_typing___ushort_as_bfloat16) +) + + +@register +class _typing___shfl_sync(ConcreteTemplate): + key = globals()["__shfl_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_sync, types.Function(_typing___shfl_sync)) + + +@register +class _typing___shfl_up_sync(ConcreteTemplate): + key = globals()["__shfl_up_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_up_sync, types.Function(_typing___shfl_up_sync)) + + +@register +class _typing___shfl_down_sync(ConcreteTemplate): + key = globals()["__shfl_down_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, uint32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, uint32, int32 + ), + ] + + +register_global(__shfl_down_sync, types.Function(_typing___shfl_down_sync)) + + +@register +class _typing___shfl_xor_sync(ConcreteTemplate): + key = globals()["__shfl_xor_sync"] + cases = [ + signature( + _type___nv_bfloat162, uint32, _type___nv_bfloat162, int32, int32 + ), + signature( + _type___nv_bfloat16, uint32, _type___nv_bfloat16, int32, int32 + ), + ] + + +register_global(__shfl_xor_sync, 
types.Function(_typing___shfl_xor_sync)) + + +@register +class _typing___ldg(ConcreteTemplate): + key = globals()["__ldg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldg, types.Function(_typing___ldg)) + + +@register +class _typing___ldcg(ConcreteTemplate): + key = globals()["__ldcg"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcg, types.Function(_typing___ldcg)) + + +@register +class _typing___ldca(ConcreteTemplate): + key = globals()["__ldca"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldca, types.Function(_typing___ldca)) + + +@register +class _typing___ldcs(ConcreteTemplate): + key = globals()["__ldcs"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcs, types.Function(_typing___ldcs)) + + +@register +class _typing___ldlu(ConcreteTemplate): + key = globals()["__ldlu"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldlu, types.Function(_typing___ldlu)) + + +@register +class _typing___ldcv(ConcreteTemplate): + key = globals()["__ldcv"] + cases = [ + signature(_type___nv_bfloat162, CPointer(_type___nv_bfloat162)), + signature(_type___nv_bfloat16, CPointer(_type___nv_bfloat16)), + ] + + +register_global(__ldcv, types.Function(_typing___ldcv)) + + +@register +class _typing___stwb(ConcreteTemplate): + key = globals()["__stwb"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, 
CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwb, types.Function(_typing___stwb)) + + +@register +class _typing___stcg(ConcreteTemplate): + key = globals()["__stcg"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcg, types.Function(_typing___stcg)) + + +@register +class _typing___stcs(ConcreteTemplate): + key = globals()["__stcs"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stcs, types.Function(_typing___stcs)) + + +@register +class _typing___stwt(ConcreteTemplate): + key = globals()["__stwt"] + cases = [ + signature(void, CPointer(_type___nv_bfloat162), _type___nv_bfloat162), + signature(void, CPointer(_type___nv_bfloat16), _type___nv_bfloat16), + ] + + +register_global(__stwt, types.Function(_typing___stwt)) + + +@register +class _typing___heq2(ConcreteTemplate): + key = globals()["__heq2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__heq2, types.Function(_typing___heq2)) + + +@register +class _typing___hne2(ConcreteTemplate): + key = globals()["__hne2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hne2, types.Function(_typing___hne2)) + + +@register +class _typing___hle2(ConcreteTemplate): + key = globals()["__hle2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hle2, types.Function(_typing___hle2)) + + +@register +class _typing___hge2(ConcreteTemplate): + key = globals()["__hge2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hge2, 
types.Function(_typing___hge2)) + + +@register +class _typing___hlt2(ConcreteTemplate): + key = globals()["__hlt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hlt2, types.Function(_typing___hlt2)) + + +@register +class _typing___hgt2(ConcreteTemplate): + key = globals()["__hgt2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgt2, types.Function(_typing___hgt2)) + + +@register +class _typing___hequ2(ConcreteTemplate): + key = globals()["__hequ2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hequ2, types.Function(_typing___hequ2)) + + +@register +class _typing___hneu2(ConcreteTemplate): + key = globals()["__hneu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hneu2, types.Function(_typing___hneu2)) + + +@register +class _typing___hleu2(ConcreteTemplate): + key = globals()["__hleu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hleu2, types.Function(_typing___hleu2)) + + +@register +class _typing___hgeu2(ConcreteTemplate): + key = globals()["__hgeu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgeu2, types.Function(_typing___hgeu2)) + + +@register +class _typing___hltu2(ConcreteTemplate): + key = globals()["__hltu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hltu2, types.Function(_typing___hltu2)) + + +@register +class _typing___hgtu2(ConcreteTemplate): + key = globals()["__hgtu2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hgtu2, 
types.Function(_typing___hgtu2)) + + +@register +class _typing___heq2_mask(ConcreteTemplate): + key = globals()["__heq2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__heq2_mask, types.Function(_typing___heq2_mask)) + + +@register +class _typing___hne2_mask(ConcreteTemplate): + key = globals()["__hne2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hne2_mask, types.Function(_typing___hne2_mask)) + + +@register +class _typing___hle2_mask(ConcreteTemplate): + key = globals()["__hle2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hle2_mask, types.Function(_typing___hle2_mask)) + + +@register +class _typing___hge2_mask(ConcreteTemplate): + key = globals()["__hge2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hge2_mask, types.Function(_typing___hge2_mask)) + + +@register +class _typing___hlt2_mask(ConcreteTemplate): + key = globals()["__hlt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hlt2_mask, types.Function(_typing___hlt2_mask)) + + +@register +class _typing___hgt2_mask(ConcreteTemplate): + key = globals()["__hgt2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgt2_mask, types.Function(_typing___hgt2_mask)) + + +@register +class _typing___hequ2_mask(ConcreteTemplate): + key = globals()["__hequ2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hequ2_mask, types.Function(_typing___hequ2_mask)) + + +@register +class _typing___hneu2_mask(ConcreteTemplate): + key = globals()["__hneu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneu2_mask, types.Function(_typing___hneu2_mask)) + + +@register +class 
_typing___hleu2_mask(ConcreteTemplate): + key = globals()["__hleu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hleu2_mask, types.Function(_typing___hleu2_mask)) + + +@register +class _typing___hgeu2_mask(ConcreteTemplate): + key = globals()["__hgeu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgeu2_mask, types.Function(_typing___hgeu2_mask)) + + +@register +class _typing___hltu2_mask(ConcreteTemplate): + key = globals()["__hltu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hltu2_mask, types.Function(_typing___hltu2_mask)) + + +@register +class _typing___hgtu2_mask(ConcreteTemplate): + key = globals()["__hgtu2_mask"] + cases = [signature(uint32, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hgtu2_mask, types.Function(_typing___hgtu2_mask)) + + +@register +class _typing___hisnan2(ConcreteTemplate): + key = globals()["__hisnan2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hisnan2, types.Function(_typing___hisnan2)) + + +@register +class _typing___hadd2(ConcreteTemplate): + key = globals()["__hadd2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2, types.Function(_typing___hadd2)) + + +@register +class _typing___hsub2(ConcreteTemplate): + key = globals()["__hsub2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2, types.Function(_typing___hsub2)) + + +@register +class _typing___hmul2(ConcreteTemplate): + key = globals()["__hmul2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2, types.Function(_typing___hmul2)) + + +@register +class _typing___hadd2_rn(ConcreteTemplate): + key = 
globals()["__hadd2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_rn, types.Function(_typing___hadd2_rn)) + + +@register +class _typing___hsub2_rn(ConcreteTemplate): + key = globals()["__hsub2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_rn, types.Function(_typing___hsub2_rn)) + + +@register +class _typing___hmul2_rn(ConcreteTemplate): + key = globals()["__hmul2_rn"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_rn, types.Function(_typing___hmul2_rn)) + + +@register +class _typing___h2div(ConcreteTemplate): + key = globals()["__h2div"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__h2div, types.Function(_typing___h2div)) + + +@register +class _typing___habs2(ConcreteTemplate): + key = globals()["__habs2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__habs2, types.Function(_typing___habs2)) + + +@register +class _typing___hadd2_sat(ConcreteTemplate): + key = globals()["__hadd2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hadd2_sat, types.Function(_typing___hadd2_sat)) + + +@register +class _typing___hsub2_sat(ConcreteTemplate): + key = globals()["__hsub2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hsub2_sat, types.Function(_typing___hsub2_sat)) + + +@register +class _typing___hmul2_sat(ConcreteTemplate): + key = globals()["__hmul2_sat"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] + + +register_global(__hmul2_sat, types.Function(_typing___hmul2_sat)) + + +@register +class 
_typing___hfma2(ConcreteTemplate): + key = globals()["__hfma2"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2, types.Function(_typing___hfma2)) + + +@register +class _typing___hfma2_sat(ConcreteTemplate): + key = globals()["__hfma2_sat"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hfma2_sat, types.Function(_typing___hfma2_sat)) + + +@register +class _typing___hneg2(ConcreteTemplate): + key = globals()["__hneg2"] + cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hneg2, types.Function(_typing___hneg2)) + + +@register +class _typing___habs(ConcreteTemplate): + key = globals()["__habs"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__habs, types.Function(_typing___habs)) + + +@register +class _typing___hadd(ConcreteTemplate): + key = globals()["__hadd"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd, types.Function(_typing___hadd)) + + +@register +class _typing___hsub(ConcreteTemplate): + key = globals()["__hsub"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub, types.Function(_typing___hsub)) + + +@register +class _typing___hmul(ConcreteTemplate): + key = globals()["__hmul"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul, types.Function(_typing___hmul)) + + +@register +class _typing___hadd_rn(ConcreteTemplate): + key = globals()["__hadd_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_rn, types.Function(_typing___hadd_rn)) + + +@register +class _typing___hsub_rn(ConcreteTemplate): + key = 
globals()["__hsub_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_rn, types.Function(_typing___hsub_rn)) + + +@register +class _typing___hmul_rn(ConcreteTemplate): + key = globals()["__hmul_rn"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_rn, types.Function(_typing___hmul_rn)) + + +@register +class _typing___hdiv(ConcreteTemplate): + key = globals()["__hdiv"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hdiv, types.Function(_typing___hdiv)) + + +@register +class _typing___hadd_sat(ConcreteTemplate): + key = globals()["__hadd_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hadd_sat, types.Function(_typing___hadd_sat)) + + +@register +class _typing___hsub_sat(ConcreteTemplate): + key = globals()["__hsub_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hsub_sat, types.Function(_typing___hsub_sat)) + + +@register +class _typing___hmul_sat(ConcreteTemplate): + key = globals()["__hmul_sat"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] + + +register_global(__hmul_sat, types.Function(_typing___hmul_sat)) + + +@register +class _typing___hfma(ConcreteTemplate): + key = globals()["__hfma"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma, types.Function(_typing___hfma)) + + +@register +class _typing___hfma_sat(ConcreteTemplate): + key = globals()["__hfma_sat"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] + + +register_global(__hfma_sat, types.Function(_typing___hfma_sat)) + + +@register +class 
_typing___hneg(ConcreteTemplate): + key = globals()["__hneg"] + cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__hneg, types.Function(_typing___hneg)) + + +@register +class _typing___hbeq2(ConcreteTemplate): + key = globals()["__hbeq2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbeq2, types.Function(_typing___hbeq2)) + + +@register +class _typing___hbne2(ConcreteTemplate): + key = globals()["__hbne2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbne2, types.Function(_typing___hbne2)) + + +@register +class _typing___hble2(ConcreteTemplate): + key = globals()["__hble2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hble2, types.Function(_typing___hble2)) + + +@register +class _typing___hbge2(ConcreteTemplate): + key = globals()["__hbge2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbge2, types.Function(_typing___hbge2)) + + +@register +class _typing___hblt2(ConcreteTemplate): + key = globals()["__hblt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hblt2, types.Function(_typing___hblt2)) + + +@register +class _typing___hbgt2(ConcreteTemplate): + key = globals()["__hbgt2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgt2, types.Function(_typing___hbgt2)) + + +@register +class _typing___hbequ2(ConcreteTemplate): + key = globals()["__hbequ2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbequ2, types.Function(_typing___hbequ2)) + + +@register +class _typing___hbneu2(ConcreteTemplate): + key = globals()["__hbneu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbneu2, types.Function(_typing___hbneu2)) + + +@register +class 
_typing___hbleu2(ConcreteTemplate): + key = globals()["__hbleu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbleu2, types.Function(_typing___hbleu2)) + + +@register +class _typing___hbgeu2(ConcreteTemplate): + key = globals()["__hbgeu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgeu2, types.Function(_typing___hbgeu2)) + + +@register +class _typing___hbltu2(ConcreteTemplate): + key = globals()["__hbltu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbltu2, types.Function(_typing___hbltu2)) + + +@register +class _typing___hbgtu2(ConcreteTemplate): + key = globals()["__hbgtu2"] + cases = [signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162)] + + +register_global(__hbgtu2, types.Function(_typing___hbgtu2)) + + +@register +class _typing___heq(ConcreteTemplate): + key = globals()["__heq"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] + + +register_global(__heq, types.Function(_typing___heq)) + + +@register +class _typing___hne(ConcreteTemplate): + key = globals()["__hne"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_neg_2_lower(shim_stream, shim_obj) +register_global(__hne, types.Function(_typing___hne)) -def _operator_eq_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_eq_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator==(*lh, *rh); - return 0; - } - """ +@register +class _typing___hle(ConcreteTemplate): + key = globals()["__hle"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_eq_2 = declare_device( - "operator_eq_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_eq_2_caller(arg_0, arg_1): - return operator_eq_2(arg_0, arg_1) +register_global(__hle, types.Function(_typing___hle)) - 
@lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_eq_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_eq_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hge(ConcreteTemplate): + key = globals()["__hge"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_eq_2_lower(shim_stream, shim_obj) +register_global(__hge, types.Function(_typing___hge)) -def _operator_ne_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ne_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator!=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hlt(ConcreteTemplate): + key = globals()["__hlt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_ne_2 = declare_device( - "operator_ne_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_ne_2_caller(arg_0, arg_1): - return operator_ne_2(arg_0, arg_1) +register_global(__hlt, types.Function(_typing___hlt)) - @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ne_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_ne_2_caller, - signature( - bool_, - 
CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgt(ConcreteTemplate): + key = globals()["__hgt"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_ne_2_lower(shim_stream, shim_obj) +register_global(__hgt, types.Function(_typing___hgt)) -def _operator_gt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_gt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>(*lh, *rh); - return 0; - } - """ +@register +class _typing___hequ(ConcreteTemplate): + key = globals()["__hequ"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_gt_2 = declare_device( - "operator_gt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_gt_2_caller(arg_0, arg_1): - return operator_gt_2(arg_0, arg_1) +register_global(__hequ, types.Function(_typing___hequ)) - @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_gt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_gt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hneu(ConcreteTemplate): + key = globals()["__hneu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_gt_2_lower(shim_stream, shim_obj) +register_global(__hneu, types.Function(_typing___hneu)) -def _operator_lt_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_lt_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = 
operator<(*lh, *rh); - return 0; - } - """ +@register +class _typing___hleu(ConcreteTemplate): + key = globals()["__hleu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_lt_2 = declare_device( - "operator_lt_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_lt_2_caller(arg_0, arg_1): - return operator_lt_2(arg_0, arg_1) +register_global(__hleu, types.Function(_typing___hleu)) - @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_lt_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_lt_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgeu(ConcreteTemplate): + key = globals()["__hgeu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_lt_2_lower(shim_stream, shim_obj) +register_global(__hgeu, types.Function(_typing___hgeu)) -def _operator_ge_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_ge_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator>=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hltu(ConcreteTemplate): + key = globals()["__hltu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] - operator_ge_2 = declare_device( - "operator_ge_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_ge_2_caller(arg_0, arg_1): - return operator_ge_2(arg_0, arg_1) +register_global(__hltu, types.Function(_typing___hltu)) - @lower(operator.ge, _type___nv_bfloat162, 
_type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_ge_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_ge_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), - ), - ptrs, - ) +@register +class _typing___hgtu(ConcreteTemplate): + key = globals()["__hgtu"] + cases = [signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16)] -_operator_ge_2_lower(shim_stream, shim_obj) +register_global(__hgtu, types.Function(_typing___hgtu)) -def _operator_le_2_lower(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - operator_le_2(bool &retval , __nv_bfloat162* lh, __nv_bfloat162* rh) { - retval = operator<=(*lh, *rh); - return 0; - } - """ +@register +class _typing___hisnan(ConcreteTemplate): + key = globals()["__hisnan"] + cases = [signature(bool_, _type___nv_bfloat16)] - operator_le_2 = declare_device( - "operator_le_2", - bool_(CPointer(_type___nv_bfloat162), CPointer(_type___nv_bfloat162)), - ) - def operator_le_2_caller(arg_0, arg_1): - return operator_le_2(arg_0, arg_1) +register_global(__hisnan, types.Function(_typing___hisnan)) - @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) - def impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("operator_le_2", shim_raw_str) - ptrs = [builder.alloca(context.get_value_type(arg)) for arg in sig.args] - for ptr, ty, arg in zip(ptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - return context.compile_internal( - builder, - operator_le_2_caller, - signature( - bool_, - CPointer(_type___nv_bfloat162), - CPointer(_type___nv_bfloat162), 
- ), - ptrs, - ) +@register +class _typing___hmax(ConcreteTemplate): + key = globals()["__hmax"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -_operator_le_2_lower(shim_stream, shim_obj) +register_global(__hmax, types.Function(_typing___hmax)) @register -class _typing_make_bfloat162(ConcreteTemplate): - key = globals()["make_bfloat162"] +class _typing___hmin(ConcreteTemplate): + key = globals()["__hmin"] cases = [ - signature( - _type___nv_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16 - ) + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) ] -register_global(make_bfloat162, types.Function(_typing_make_bfloat162)) +register_global(__hmin, types.Function(_typing___hmin)) @register -class _typing_htrunc(ConcreteTemplate): - key = globals()["htrunc"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmax_nan(ConcreteTemplate): + key = globals()["__hmax_nan"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -register_global(htrunc, types.Function(_typing_htrunc)) +register_global(__hmax_nan, types.Function(_typing___hmax_nan)) @register -class _typing_hceil(ConcreteTemplate): - key = globals()["hceil"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmin_nan(ConcreteTemplate): + key = globals()["__hmin_nan"] + cases = [ + signature(_type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16) + ] -register_global(hceil, types.Function(_typing_hceil)) +register_global(__hmin_nan, types.Function(_typing___hmin_nan)) @register -class _typing_hfloor(ConcreteTemplate): - key = globals()["hfloor"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hfma_relu(ConcreteTemplate): + key = globals()["__hfma_relu"] + cases = [ + signature( + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, + ) + ] -register_global(hfloor, 
types.Function(_typing_hfloor)) +register_global(__hfma_relu, types.Function(_typing___hfma_relu)) @register -class _typing_hrint(ConcreteTemplate): - key = globals()["hrint"] - cases = [signature(_type___nv_bfloat16, _type___nv_bfloat16)] +class _typing___hmax2(ConcreteTemplate): + key = globals()["__hmax2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(hrint, types.Function(_typing_hrint)) +register_global(__hmax2, types.Function(_typing___hmax2)) @register -class _typing_h2trunc(ConcreteTemplate): - key = globals()["h2trunc"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmin2(ConcreteTemplate): + key = globals()["__hmin2"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2trunc, types.Function(_typing_h2trunc)) +register_global(__hmin2, types.Function(_typing___hmin2)) @register -class _typing_h2ceil(ConcreteTemplate): - key = globals()["h2ceil"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmax2_nan(ConcreteTemplate): + key = globals()["__hmax2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2ceil, types.Function(_typing_h2ceil)) +register_global(__hmax2_nan, types.Function(_typing___hmax2_nan)) @register -class _typing_h2floor(ConcreteTemplate): - key = globals()["h2floor"] - cases = [signature(_type___nv_bfloat162, _type___nv_bfloat162)] +class _typing___hmin2_nan(ConcreteTemplate): + key = globals()["__hmin2_nan"] + cases = [ + signature( + _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + ) + ] -register_global(h2floor, types.Function(_typing_h2floor)) +register_global(__hmin2_nan, types.Function(_typing___hmin2_nan)) @register -class _typing_h2rint(ConcreteTemplate): - key = globals()["h2rint"] - cases = [signature(_type___nv_bfloat162, 
_type___nv_bfloat162)] +class _typing___hfma2_relu(ConcreteTemplate): + key = globals()["__hfma2_relu"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] -register_global(h2rint, types.Function(_typing_h2rint)) +register_global(__hfma2_relu, types.Function(_typing___hfma2_relu)) + + +@register +class _typing___hcmadd(ConcreteTemplate): + key = globals()["__hcmadd"] + cases = [ + signature( + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, + ) + ] + + +register_global(__hcmadd, types.Function(_typing___hcmadd)) @register @@ -4988,6 +15945,15 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) +@register +class _typing___half(ConcreteTemplate): + key = globals()["__half"] + cases = [signature(void, _type___nv_bfloat16)] + + +register_global(__half, types.Function(_typing___half)) + + @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -5149,7 +16115,297 @@ class _typing_operator_le(ConcreteTemplate): # Aliases: -__nv_bfloat16_raw = unnamed1401637 -__nv_bfloat162_raw = unnamed1401746 +__nv_bfloat16_raw = unnamed1405307 +__nv_bfloat162_raw = unnamed1405416 nv_bfloat16 = __nv_bfloat16 nv_bfloat162 = __nv_bfloat162 + + +# Symbols: + + +_NBTYPE_SYMBOLS = [ + "_type_unnamed1405307", + "_type_unnamed1405416", + "_type___nv_bfloat16", + "_type___nv_bfloat162", +] + + +_RECORD_SYMBOLS = [ + "unnamed1405307", + "unnamed1405416", + "__nv_bfloat16", + "__nv_bfloat162", +] + + +_FUNCTION_SYMBOLS = [ + "__double2bfloat16", + "__float2bfloat16", + "__float2bfloat16_rn", + "__float2bfloat16_rz", + "__float2bfloat16_rd", + "__float2bfloat16_ru", + "__bfloat162float", + "__float2bfloat162_rn", + "__floats2bfloat162_rn", + "__low2float", + "__high2float", + "__float22bfloat162_rn", + "__bfloat1622float2", + "__bfloat162char_rz", + "__bfloat162uchar_rz", + 
"__bfloat162int_rn", + "__bfloat162int_rz", + "__bfloat162int_rd", + "__bfloat162int_ru", + "__int2bfloat16_rn", + "__int2bfloat16_rz", + "__int2bfloat16_rd", + "__int2bfloat16_ru", + "__bfloat162short_rn", + "__bfloat162short_rz", + "__bfloat162short_rd", + "__bfloat162short_ru", + "__short2bfloat16_rn", + "__short2bfloat16_rz", + "__short2bfloat16_rd", + "__short2bfloat16_ru", + "__bfloat162uint_rn", + "__bfloat162uint_rz", + "__bfloat162uint_rd", + "__bfloat162uint_ru", + "__uint2bfloat16_rn", + "__uint2bfloat16_rz", + "__uint2bfloat16_rd", + "__uint2bfloat16_ru", + "__bfloat162ushort_rn", + "__bfloat162ushort_rz", + "__bfloat162ushort_rd", + "__bfloat162ushort_ru", + "__ushort2bfloat16_rn", + "__ushort2bfloat16_rz", + "__ushort2bfloat16_rd", + "__ushort2bfloat16_ru", + "__bfloat162ull_rn", + "__bfloat162ull_rz", + "make_bfloat162", + "__bfloat162ull_rd", + "__bfloat162ull_ru", + "__ull2bfloat16_rn", + "__ull2bfloat16_rz", + "__ull2bfloat16_rd", + "__ull2bfloat16_ru", + "__bfloat162ll_rn", + "__bfloat162ll_rz", + "__bfloat162ll_rd", + "__bfloat162ll_ru", + "__ll2bfloat16_rn", + "__ll2bfloat16_rz", + "__ll2bfloat16_rd", + "__ll2bfloat16_ru", + "htrunc", + "hceil", + "hfloor", + "hrint", + "h2trunc", + "h2ceil", + "h2floor", + "h2rint", + "__bfloat162bfloat162", + "__lowhigh2highlow", + "__lows2bfloat162", + "__highs2bfloat162", + "__high2bfloat16", + "__low2bfloat16", + "__hisinf", + "__halves2bfloat162", + "__low2bfloat162", + "__high2bfloat162", + "__bfloat16_as_short", + "__bfloat16_as_ushort", + "__short_as_bfloat16", + "__ushort_as_bfloat16", + "__shfl_sync", + "__shfl_sync", + "__shfl_up_sync", + "__shfl_up_sync", + "__shfl_down_sync", + "__shfl_down_sync", + "__shfl_xor_sync", + "__shfl_xor_sync", + "__ldg", + "__ldg", + "__ldcg", + "__ldcg", + "__ldca", + "__ldca", + "__ldcs", + "__ldcs", + "__ldlu", + "__ldlu", + "__ldcv", + "__ldcv", + "__stwb", + "__stwb", + "__stcg", + "__stcg", + "__stcs", + "__stcs", + "__stwt", + "__stwt", + "__heq2", + "__hne2", + 
"__hle2", + "__hge2", + "__hlt2", + "__hgt2", + "__hequ2", + "__hneu2", + "__hleu2", + "__hgeu2", + "__hltu2", + "__hgtu2", + "__heq2_mask", + "__hne2_mask", + "__hle2_mask", + "__hge2_mask", + "__hlt2_mask", + "__hgt2_mask", + "__hequ2_mask", + "__hneu2_mask", + "__hleu2_mask", + "__hgeu2_mask", + "__hltu2_mask", + "__hgtu2_mask", + "__hisnan2", + "__hadd2", + "__hsub2", + "__hmul2", + "__hadd2_rn", + "__hsub2_rn", + "__hmul2_rn", + "__h2div", + "__habs2", + "__hadd2_sat", + "__hsub2_sat", + "__hmul2_sat", + "__hfma2", + "__hfma2_sat", + "__hneg2", + "__habs", + "__hadd", + "__hsub", + "__hmul", + "__hadd_rn", + "__hsub_rn", + "__hmul_rn", + "__hdiv", + "__hadd_sat", + "__hsub_sat", + "__hmul_sat", + "__hfma", + "__hfma_sat", + "__hneg", + "__hbeq2", + "__hbne2", + "__hble2", + "__hbge2", + "__hblt2", + "__hbgt2", + "__hbequ2", + "__hbneu2", + "__hbleu2", + "__hbgeu2", + "__hbltu2", + "__hbgtu2", + "__heq", + "__hne", + "__hle", + "__hge", + "__hlt", + "__hgt", + "__hequ", + "__hneu", + "__hleu", + "__hgeu", + "__hltu", + "__hgtu", + "__hisnan", + "__hmax", + "__hmin", + "__hmax_nan", + "__hmin_nan", + "__hfma_relu", + "__hmax2", + "__hmin2", + "__hmax2_nan", + "__hmin2_nan", + "__hfma2_relu", + "__hcmadd", + "hsqrt", + "hrsqrt", + "hrcp", + "hlog", + "hlog2", + "hlog10", + "hexp", + "htanh_approx", + "h2tanh_approx", + "htanh", + "h2tanh", + "hexp2", + "hexp10", + "hcos", + "hsin", + "h2sqrt", + "h2rsqrt", + "h2rcp", + "h2log", + "h2log2", + "h2log10", + "h2exp", + "h2exp2", + "h2exp10", + "h2cos", + "h2sin", + "atomicAdd", + "atomicAdd", + "operator+", + "operator-", + "operator*", + "operator/", + "operator+=", + "operator-=", + "operator*=", + "operator/=", + "operator+", + "operator-", + "operator==", + "operator!=", + "operator>", + "operator<", + "operator>=", + "operator<=", + "operator+", + "operator-", + "operator*", + "operator/", + "operator+=", + "operator-=", + "operator*=", + "operator/=", + "operator+", + "operator-", + "operator==", + 
"operator!=", + "operator>", + "operator<", + "operator>=", + "operator<=", + "__half", +] + + +__all__ = _NBTYPE_SYMBOLS + _RECORD_SYMBOLS + _FUNCTION_SYMBOLS From ae6de8cfa4e4600e86ec7fe55199e0d86bb3c27d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 4 Aug 2025 16:31:54 -0700 Subject: [PATCH 03/56] remove re-import of bfloat16 type --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 4fd6c50e4..45eae7d41 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -29,7 +29,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( From 8498a995d250212096bedd9933f3c4b38dbd45ca Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 5 Aug 2025 12:34:09 -0700 Subject: [PATCH 04/56] implement custom bfloat16 type object; insert type registry into cuda target; mock bfloat16 llvmIR type --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 81 ++++++-------------- numba_cuda/numba/cuda/bf16.py | 5 +- numba_cuda/numba/cuda/models.py | 35 ++++++++- numba_cuda/numba/cuda/target.py | 12 ++- numba_cuda/numba/cuda/types.py | 15 ++++ 5 files changed, 86 insertions(+), 62 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 8a263b962..162eec8a8 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,10 +25,11 @@ ) from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast -from 
numba.core.typing.templates import Registry as TypingRegistry +from numba.cuda.typing.templates import Registry as TypingRegistry from numba.cuda.typing import signature from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate from numba.cuda import CUSource, declare_device +from numba.cuda.types import bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -52,6 +53,7 @@ ) float32x2 = vector_types["float32x2"] +__half = float16 # Setups: @@ -192,28 +194,7 @@ class _ctor_template_unnamed1405416(ConcreteTemplate): register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -417,8 +398,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(float32)), - value, + signature(_type___nv_bfloat16, float32), + [value], ) @@ -470,8 +451,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(float64)), - value, + signature(_type___nv_bfloat16, float64), + [value], ) @@ -523,8 +504,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( 
context, builder, - signature(_type___nv_bfloat16, CPointer(int16)), - value, + signature(_type___nv_bfloat16, int16), + [value], ) @@ -576,8 +557,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint16)), - value, + signature(_type___nv_bfloat16, uint16), + [value], ) @@ -629,8 +610,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int32)), - value, + signature(_type___nv_bfloat16, int32), + [value], ) @@ -682,8 +663,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint32)), - value, + signature(_type___nv_bfloat16, uint32), + [value], ) @@ -735,8 +716,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int64)), - value, + signature(_type___nv_bfloat16, int64), + [value], ) @@ -788,8 +769,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint64)), - value, + signature(_type___nv_bfloat16, uint64), + [value], ) @@ -841,8 +822,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(int64)), - value, + signature(_type___nv_bfloat16, int64), + [value], ) @@ -894,8 +875,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(uint64)), - value, + signature(_type___nv_bfloat16, uint64), + [value], ) @@ -13635,10 +13616,6 @@ def impl(context, builder, sig, args): _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) -def __half(): - pass - - def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" 
__device__ int @@ -15944,15 +15921,6 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) -@register -class _typing___half(ConcreteTemplate): - key = globals()["__half"] - cases = [signature(void, _type___nv_bfloat16)] - - -register_global(__half, types.Function(_typing___half)) - - @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -16403,7 +16371,6 @@ class _typing_operator_le(ConcreteTemplate): "operator<", "operator>=", "operator<=", - "__half", ] diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 1ef6a370a..7ce28e459 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -1,6 +1,5 @@ from numba.cuda._internal.cuda_bf16 import ( - _type_class___nv_bfloat16, - nv_bfloat16 as bfloat16, + __nv_bfloat16 as bfloat16, htrunc, hceil, hfloor, @@ -25,7 +24,7 @@ def _make_unary(a, func): - if isinstance(a, _type_class___nv_bfloat16): + if isinstance(a, bfloat16): return lambda a: func(a) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index f9735d7fc..768a1c3bf 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -3,9 +3,10 @@ from llvmlite import ir from numba.core.datamodel.registry import DataModelManager, register +from numba.core.datamodel import PrimitiveModel from numba.core.extending import models from numba.core import types -from numba.cuda.types import Dim3, GridGroup, CUDADispatcher +from numba.cuda.types import Dim3, GridGroup, CUDADispatcher, Bfloat16 cuda_data_manager = DataModelManager() @@ -42,3 +43,35 @@ def __init__(self, dmm, fe_type): register_model(CUDADispatcher)(models.OpaqueModel) + + +def _as_bfloat(value): + # Step 1: Convert to float + f = ir.types._as_float(value) + # Step 2: Truncate (or round, we choose truncate) last 16 bits + bf = f >> 16 + return bf + + +class BfloatType(ir.types._BaseFloatType): + """Brain-float type""" + + 
null = "0.0" + intrinsic_name = "bfloat" + + def __str__(self): + return "bfloat" + + def format_constant(self, value): + return ir.types._format_double(_as_bfloat(value)) + + +BfloatType._create_instance() + + +@register_model(Bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + # be_type = BfloatType() + be_type = ir.IntType(16) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 26b717264..66f2e2359 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -31,7 +31,14 @@ class CUDATypingContext(typing.BaseContext): def load_additional_registries(self): - from . import cudadecl, cudamath, fp16, libdevicedecl, vector_types + from . import ( + cudadecl, + cudamath, + fp16, + bf16, + libdevicedecl, + vector_types, + ) from numba.core.typing import enumdecl, cffi_utils self.install_registry(cudadecl.registry) @@ -42,6 +49,7 @@ def load_additional_registries(self): self.install_registry(enumdecl.registry) self.install_registry(vector_types.typing_registry) self.install_registry(fp16.typing_registry) + self.install_registry(bf16.typing_registry) def resolve_value_type(self, val): # treat other dispatcher object as another device function @@ -154,6 +162,7 @@ def load_additional_registries(self): libdeviceimpl, mathimpl, vector_types, + bf16, ) # fix for #8940 @@ -167,6 +176,7 @@ def load_additional_registries(self): self.install_registry(mathimpl.registry) self.install_registry(vector_types.impl_registry) self.install_registry(fp16.target_registry) + self.install_registry(bf16.target_registry) def codegen(self): return self._internal_codegen diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 92b8f3ecb..844ce393a 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -38,3 +38,18 @@ class CUDADispatcher(types.Dispatcher): # is still probably a 
good idea to have a separate type for CUDA # dispatchers, and this type might get other differentiation from the CPU # dispatcher type in future. + + +class Bfloat16(types.Number): + """ + A bfloat16 type. + """ + + def __init__(self): + super().__init__(name="__nv_bfloat16") + + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +bfloat16 = Bfloat16() From f79f0bfa659df0e88b528a4d93ed5ff5932bcf3b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:55:32 -0700 Subject: [PATCH 05/56] update bfloat16 bindings --- configs/cuda_bf16.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 29aa1d2dd..348e48ee7 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -1,7 +1,7 @@ Name: Numba Bfloat16 Version: 0.0.2 GPU Arch: - - sm_80 # The first architecture to support bfloat16 + - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16 Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h File List: - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h @@ -21,7 +21,4 @@ Data Models: __nv_bfloat162: StructModel nv_bfloat162: StructModel Shim Include Override: "\"cuda_bf16.h\"" -Additional Import: - - os -Require Pynvjitlink: False -Use Separate Registry: true +Use Separate Registry: True From 1b3598f2011e9776ea9463e080831142f39969d1 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:56:07 -0700 Subject: [PATCH 06/56] export typing and target registries in bf16 --- numba_cuda/numba/cuda/bf16.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 7ce28e459..693a8e573 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -1,5 +1,7 @@ from numba.cuda._internal.cuda_bf16 import ( - __nv_bfloat16 as bfloat16, + typing_registry, + target_registry, + nv_bfloat16 as bfloat16, htrunc, hceil, hfloor, @@ -18,13 +20,14 @@ htanh, htanh_approx, ) 
+from numba.cuda.types import Bfloat16 from numba.extending import overload import math def _make_unary(a, func): - if isinstance(a, bfloat16): + if isinstance(a, Bfloat16): return lambda a: func(a) @@ -90,6 +93,8 @@ def exp2_ol(a): __all__ = [ + "typing_registry", + "target_registry", "bfloat16", "htrunc", "hceil", From efc32f0aecd9a8bfb1b18af6d2468d6a2d3aa656 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 20:56:34 -0700 Subject: [PATCH 07/56] manually implement the lower_cast for float16 to bfloat16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 162eec8a8..fcc70298d 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -307,6 +307,62 @@ def conversion_impl(context, builder, fromty, toty, value): _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) +def _lower__float16_to_bfloat16(shim_stream, shim_obj): + shim_raw_str = """ + extern "C" __device__ int + _ZN13__float162bfloat16_nbst(int &ignore, __nv_bfloat16 *self , __half* hr) { + new (self) __nv_bfloat16(*hr); + return 0; + } + """ + + _ctor_decl___float162bfloat16 = declare_device( + "_ZN13__float162bfloat16_nbst", + int32(CPointer(_type___nv_bfloat16), CPointer(float16)), + ) + + def __float162bfloat16_device_caller(arg_0, arg_1): + return _ctor_decl___float162bfloat16(arg_0, arg_1) + + def ctor_impl(context, builder, sig, args): + context.active_code_library.add_linking_file(shim_obj) + shim_stream.write_with_key("_ZN13__float162bfloat16_nbst", shim_raw_str) + selfptr = builder.alloca( + context.get_value_type(_type___nv_bfloat16), name="selfptr" + ) + argptrs = [ + builder.alloca(context.get_value_type(arg)) for arg in sig.args + ] + for ptr, ty, arg in zip(argptrs, sig.args, args): + builder.store(arg, ptr, align=getattr(ty, "alignof_", 
None)) + + context.compile_internal( + builder, + __float162bfloat16_device_caller, + signature( + int32, + CPointer(_type___nv_bfloat16), + CPointer(float16), + ), + (selfptr, *argptrs), + ) + return builder.load( + selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) + ) + + @lower_cast(float16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, float16), + [value], + ) + + +_lower__float16_to_bfloat16(shim_stream, shim_obj) + + def _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int From b0f76e96a39bfdf0b1b557dccd50483a11fc07a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 6 Aug 2025 22:32:17 -0700 Subject: [PATCH 08/56] add converting rules and unify rules --- numba_cuda/numba/cuda/types.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 844ce393a..92e3cafde 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -1,4 +1,5 @@ from numba.core import types +from numba.core.typeconv import Conversion class Dim3(types.Type): @@ -49,7 +50,32 @@ def __init__(self): super().__init__(name="__nv_bfloat16") self.alignof_ = 2 - self.bitwidth = 2 * 8 + self.bitwidth = 16 + + def can_convert_from(self, other): + if isinstance(other, types.Float): + return Conversion.unsafe + + elif isinstance(other, types.Integer): + if other.bitwidth == 8: + return Conversion.safe + + return Conversion.unsafe + + def can_convert_to(self, typingctx, other): + if isinstance(other, types.Float): + if other.bitwidth >= 32: + return Conversion.safe + else: + return Conversion.unsafe + elif isinstance(other, types.Integer): + return Conversion.unsafe + + return Conversion.unsafe + + def unify(self, typingctx, other): + if isinstance(other, (types.Float, types.Integer)): + return 
typingctx.unify_pairs(self, other) bfloat16 = Bfloat16() From 041862516163d2639421198d78ee029f81b1800d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:42:31 -0700 Subject: [PATCH 09/56] choose irType based on compute capability --- numba_cuda/numba/cuda/models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 768a1c3bf..02f629575 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -72,6 +72,14 @@ def format_constant(self, value): @register_model(Bfloat16) class _model___nv_bfloat16(PrimitiveModel): def __init__(self, dmm, fe_type): - # be_type = BfloatType() - be_type = ir.IntType(16) + from numba.cuda.api import get_current_device + + major, minor = get_current_device().compute_capability + + # Blackwell device leverage latest nvvm (llvm 20+ dialect) which has + # bfloat type + if major >= 10: + be_type = BfloatType() + else: + be_type = ir.IntType(16) super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) From 6ffa69666c84b8702aa5398abbc22d338d9311ee Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:43:41 -0700 Subject: [PATCH 10/56] vend ctk13 code --- numba_cuda/numba/cuda/cudadrv/nvrtc.py | 6 +- numba_cuda/numba/cuda/include/13/cuda_bf16.h | 5118 +++++++++++++++++ .../numba/cuda/include/13/cuda_bf16.hpp | 3865 +++++++++++++ 3 files changed, 8988 insertions(+), 1 deletion(-) create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.hpp diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 0c4074a73..7b1efc225 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -356,8 +356,12 @@ def compile(src, name, cc, ltoir=False): if nvrtc_ver_major == 11: numba_include = f"{os.path.join(numba_cuda_path, 'include', '11')}" - 
else: + elif nvrtc_ver_major == 12: numba_include = f"{os.path.join(numba_cuda_path, 'include', '12')}" + elif nvrtc_ver_major == 13: + numba_include = f"{os.path.join(numba_cuda_path, 'include', '13')}" + else: + raise RuntimeError(f"Unsupported CUDA version: {nvrtc_version}") if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS: extra_includes = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":") diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h new file mode 100644 index 000000000..38feffba0 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h @@ -0,0 +1,5118 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions. +* To use these functions, include the header file \p cuda_bf16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. 
Specific examples are: +* - hsin(__nv_bfloat16); +* - hcos(__nv_bfloat16); +* - h2sin(__nv_bfloat162); +* - h2cos(__nv_bfloat162); +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent +* the use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p __nv_bfloat16 which is essentially a user-defined type. +* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ - +* If defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these constants, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +#ifndef __CUDA_BF16_H__ +#define __CUDA_BF16_H__ + +/* bring in __half data type and operations, for use in converting constructors */ +#include "cuda_fp16.h" + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* defined(__CUDACC__) || 
defined(_NVHPC_CUDA) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +#define __CUDA_BF16_TYPES_EXIST__ + +/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ +#define __BFLOAT16_TO_US(var) *(reinterpret_cast(&(var))) +#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_UI(var) *(reinterpret_cast(&(var))) +#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast(&(var))) + +/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ +struct __nv_bfloat16; +struct __nv_bfloat162; + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode. +* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2bfloat16(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. 
+* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-towards-zero mode. +* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-down mode. +* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-up mode. +* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_ru(NaN) returns NaN. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts \p nv_bfloat16 number to float.
+*
+* \details Converts nv_bfloat16 number \p a to float.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns float
+* - \p a converted to float.
+* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula.
+* - __bfloat162float(NaN) returns NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+*
+* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+* \param[in] a - float. Is only being read.
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
+* precision number.
+*
+* \see __float2bfloat16_rn(float) for further details.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
+* mode and returns \p nv_bfloat162 with converted values. 
+* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the +* result as a \p float2 packed value. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed +* char in round-towards-zero mode. NaN inputs are converted to 0. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __bfloat162char_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. 
Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. 
Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rd(NaN) returns 0.* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. +* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_ru(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. 
+* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p x and \p y into one \p nv_bfloat162 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - nv_bfloat16. Is only being read. +* \param[in] y - nv_bfloat16. Is only being read. +* +* \returns __nv_bfloat162 +* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. 
NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. 
+* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The truncated integer value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The smallest integer value not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of largest integers which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+*
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16
+* number.
+* \param[in] a - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Swaps both halves of the \p nv_bfloat162 input.
+*
+* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
+* with swapped halves.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - \p a with its halves being swapped.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. 
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. 
+* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. 
Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. 
Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. 
+* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. 
Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. 
+* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. 
+* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. 
+* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. 
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. 
Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. 
+* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). 
+* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even +* mode. 
+* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function. +* +* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function. +* +* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. 
+* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. 
Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. 
The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices use emulation path. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices of compute capability 7.x and 8.x use emulation path. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. +* +* \returns __nv_bfloat16 +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for ::std::move. + * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_INLINE__ +#define __CUDA_BF16_FORCEINLINE__ +#else +#define __CUDA_BF16_INLINE__ inline +#define __CUDA_BF16_FORCEINLINE__ __forceinline__ +#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_BF16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_BF16_CONSTEXPR__ constexpr +#else +#define __CUDA_BF16_CONSTEXPR__ +#endif + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat16_raw data type + * \details Type allows static initialization of \p nv_bfloat16 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat16, + * and not a conversion from \p short to \p nv_bfloat16. 
+ * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p nv_bfloat16 floating-point number. + */ + unsigned short x; +} __nv_bfloat16_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat162_raw data type + * \details Type allows static initialization of \p nv_bfloat162 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat162, + * and not a conversion from \p short2 to \p nv_bfloat162. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p nv_bfloat16 part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p nv_bfloat16 part. + */ + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. + */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. 
+ */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile; + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode. + */ + explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2bfloat16(__half2float(f)).__x; +) +} +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p float operator. + */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __int2bfloat16_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __uint2bfloat16_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162char_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162uchar_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in signed and unsigned char operators. + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162short_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ushort_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162int_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p int data type. 
+ * Using round-toward-zero rounding mode. + * + * See __bfloat162uint_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ll_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ull_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 addition operation. + * See also __hadd(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 subtraction operation. + * See also __hsub(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 multiplication operation. 
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 division operation. + * See also __hdiv(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with division operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix decrement operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored); +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary minus operator. + * See also __hneg(__nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h); + +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered compare equal operation. + * See also __heq(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 unordered compare not-equal operation. + * See also __hneu(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-than compare operation. 
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-than compare operation. + * See also __hlt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hge(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-or-equal compare operation. + * See also __hle(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +/** +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat162 datatype + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + /** + * Storage field holding lower \p __nv_bfloat16 part. + */ + __nv_bfloat16 x; + /** + * Storage field holding upper \p __nv_bfloat16 part. + */ + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src); +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162(); +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from two \p __nv_bfloat16 variables + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src); + + /* Convert to/from __nv_bfloat162_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const; +}; + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 addition operation. 
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 subtraction operation. + * See also __hsub2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 multiplication operation. + * See also __hmul2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 division operation. + * See also __h2div(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with division operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary minus operator. + * See also __hneg2(__nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered compare equal operation. + * See also __hbeq2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 unordered compare not-equal operation. 
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-than compare operation. + * See also __hbgt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-than compare operation. + * See also __hblt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hbge2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation. 
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); + +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ +#ifdef __CUDACC_RTC__ +inline +#else +__CUDA_BF16_FORCEINLINE__ +#endif +__half::__half(const __nv_bfloat16 f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2half_rn(__bfloat162float(f)).__x; +) +} +#endif +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) +/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. +*/ +#include "cuda_bf16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the bfloat16 numbers format. + * + * \details Should be implemented in the compiler in the future. 
+ * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat16 nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of bfloat16 numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_BF16_INLINE__ +#undef __CUDA_BF16_FORCEINLINE__ +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp new file mode 100644 index 000000000..5f610c976 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.hpp @@ -0,0 +1,3865 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_BF16_HPP__) +#define __CUDA_BF16_HPP__ + +#if !defined(__CUDA_BF16_H__) +#error "Do not include this file directly. Instead, include cuda_bf16.h." 
+#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines floating-point positive infinity value for the \p nv_bfloat16 data type + */ +#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines canonical NaN value for the \p nv_bfloat16 data type + */ +#define CUDART_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p nv_bfloat16 data type + */ +#define CUDART_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a maximum representable value for the \p nv_bfloat16 data type + */ +#define CUDART_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a negative zero value for the \p nv_bfloat16 data type + */ +#define CUDART_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a positive zero value for the \p nv_bfloat16 data type + */ +#define CUDART_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS + * \brief Defines a value of 1.0 for the \p nv_bfloat16 data type + */ +#define CUDART_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U) + +#if !(defined __DOXYGEN_ONLY__) + + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const 
__nv_bfloat16_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ volatile __nv_bfloat16 &__nv_bfloat16::operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator float() const { return __bfloat162float(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. 
+ */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator signed char() const { return __bfloat162char_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned char() const { return __bfloat162uchar_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast(__bfloat162char_rz(*this)); + } + else + { + value = static_cast(__bfloat162uchar_rz(*this)); + } + return value; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator short() const { return __bfloat162short_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned short() const { return __bfloat162ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator int() const { return __bfloat162int_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned int() const { return __bfloat162uint_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = 
static_cast(__bfloat162ll_rz(*this)); + } + else + { + retval = static_cast(__bfloat162int_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__bfloat162ull_rz(*this)); + } + else + { + retval = static_cast(__bfloat162uint_rz(*this)); + } + return retval; + } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator long long() const { return __bfloat162ll_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16::operator unsigned long long() const { return __bfloat162ull_rz(*this); } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } + __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } +#endif /* !(defined 
__CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ + + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80U; + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(__nv_bfloat162 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(__nv_bfloat162 
&&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = ::std::move(__BFLOAT162_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#else +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::__nv_bfloat162(const __nv_bfloat162_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162 &__nv_bfloat162::operator=(const __nv_bfloat162_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); +, + __nv_bfloat16_raw tr; + tr.x = h2r.x; + this->x = static_cast<__nv_bfloat16>(tr); + tr.x = h2r.y; + this->y = static_cast<__nv_bfloat16>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat162::operator __nv_bfloat162_raw() const { + __nv_bfloat162_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); +, + ret.x = static_cast<__nv_bfloat16_raw>(this->x).x; + ret.y = static_cast<__nv_bfloat16_raw>(this->y).x; +) + return ret; +} + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ 
__nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80U; one.y = 0x3F80U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80U; + one.y = 0x3F80U; + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_float_as_uint(const float f) +{ + unsigned int u; +IF_DEVICE_OR_CUDACC( + u = __float_as_uint(f); +, + memcpy(&u, &f, sizeof(f)); +, + ::std::memcpy(&u, &f, sizeof(f)); +) + return u; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_uint_as_float(const unsigned int u) +{ + float f; +IF_DEVICE_OR_CUDACC( + f = __uint_as_float(u); +, + memcpy(&f, &u, sizeof(u)); +, + ::std::memcpy(&f, &u, sizeof(u)); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + + x = __internal_float_as_uint(f); + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast(0x7fffU); + } + sign = x >> 31U; + remainder = x << 16U; + return static_cast(x >> 16U); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_double2float_rn(const double x) +{ + float r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f32.f64 %0, %1;" : "=f"(r) : "d"(x)); +, + r = static_cast(x); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ double __internal_float2double(const float x) +{ + double r; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.f64.f32 %0, %1;" : "=d"(r) : "f"(x)); +, + r = static_cast(x); +) + return r; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); + return val; +, + float f = __internal_double2float_rn(x); + const double d = __internal_float2double(f); + unsigned int u = __internal_float_as_uint(f); + + bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); + + + if ((x > 0.0) && (d > x)) { + u--; + } + 
if ((x < 0.0) && (d < x)) { + u--; + } + if ((d != x) && x_is_not_nan) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + + return __float2bfloat16(f); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +, + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; + 
return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +, + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg .b16 low;\n" + " cvt.rn.bf16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); +, + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ float __internal_device_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); +, + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); +) + return f; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) +{ + float f; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + f = __internal_device_bfloat162float(h); +, + unsigned int u = static_cast(h) << 16; + f = __internal_uint_as_float(u); +) + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) +{ + return 
__internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); +} +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); +} + +/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) +{ + __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) +{ + float hi_float; + float lo_float; + lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); + hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2int_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ int __internal_bfloat162int_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2int_rz(f); +, + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f >= static_cast(max_val)) 
{ + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + int val; + asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162int_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) +{ + int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __int2float_ru(i); + const float rd = __int2float_rd(i); + float rz = __int2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_int2bfloat16_rn(i); +, + const double d = static_cast(i); + return __double2bfloat16(d); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h) +{ + signed char i; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.s8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + const unsigned char u = static_cast(tmp); + i = static_cast(u); +, + const float f = __bfloat162float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned short tmp = 0; + asm("{ .reg.b8 myreg;\n" + " cvt.rzi.u8.bf16 myreg, %1;\n" + " mov.b16 %0, {myreg, 0};\n}" + :"=h"(tmp) : "h"(__BFLOAT16_TO_CUS(h))); + i = static_cast(tmp); +, + const float f = __bfloat162float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return 
__float2bfloat16_rz(__int2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ short int __internal_device_bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162short_rz(h); +, + const float f = __bfloat162float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum 
+ val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) +{ + short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__int2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__int2float_rd(static_cast(i))); +) +} 
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__int2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rn(__bfloat162float(h)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __internal_bfloat162uint_rz(const __nv_bfloat16 h) +{ + const float f = __bfloat162float(h); + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + i = __float2uint_rz(f); +, + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __internal_bfloat162uint_rz(h); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned int val; + asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +, + return __float2uint_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const 
__nv_bfloat16 h) +{ + unsigned int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u32.f32 %0, %1;" : "=r"(val) : "f"(f)); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + const float ru = __uint2float_ru(i); + const float rd = __uint2float_rd(i); + float rz = __uint2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_uint2bfloat16_rn(i); +, + const double d = static_cast(i); + return __double2bfloat16(d); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ unsigned short int 
__bfloat162ushort_rn(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} + +__CUDA_BF16_DECL__ unsigned short int __internal_device_bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_bfloat162ushort_rz(h); +, + const float f = __bfloat162float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + val = min_val; + } else { + val = static_cast(f); + } +) + return val; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) +{ + unsigned short int val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +, + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + const float f = static_cast(i); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 val; + asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +, + return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ unsigned long long int __internal_device_bfloat162ull_rz(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ull_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ull_rz(h); +, + const float f = __bfloat162float(h); + unsigned long long int i; + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + unsigned long long int i; + asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ull_rd(__bfloat162float(h)); +) +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.u64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ull2bfloat16_rn(const 
unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ull2float_ru(i); + const float rd = __ull2float_rd(i); + float rz = __ull2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ull2bfloat16_rn(i); +, + float f = static_cast(i); + const unsigned long long int uf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + // round up happened here + // note: no need to handle round up to f == 0x1.p64 specially + if (uf > i) { + u--; + } + if (uf != i) { + u |= 1U; + } + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ull2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ull2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ull2float_ru(i)); +) +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + long long int i; + asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +, + return __float2ll_rn(__bfloat162float(h)); +) +} + +__CUDA_BF16_DECL__ long long int __internal_device_bfloat162ll_rz(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + i = __float2ll_rz(f); +) + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_bfloat162ll_rz(h); +, + long long int i; + const float f = __bfloat162float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = min_val; + } else if (f >= static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + i = static_cast(f); + } + return i; +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rmi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return i; +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +, + const float f = __bfloat162float(h); + asm("cvt.rpi.s64.f32 %0, %1;" : "=l"(i) : "f"(f)); +) + return 
i; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + const float ru = __ll2float_ru(i); + const float rd = __ll2float_rd(i); + float rz = __ll2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_ll2bfloat16_rn(i); +, + float f = static_cast(i); + const long long int lf = static_cast(f); + unsigned int u = __internal_float_as_uint(f); + + if ((f > 0.0f) && (lf > i)) { + u--; + } + if ((f < 0.0f) && (lf < i)) { + u--; + } + if (lf != i) { + u |= 1U; + } + + f = __internal_uint_as_float(u); + return __float2bfloat16_rn(f); +) +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rz(__ll2float_rz(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_rd(__ll2float_rd(i)); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 h; + asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +, + return __float2bfloat16_ru(__ll2float_ru(i)); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 
r; + asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(truncf(__bfloat162float(h))); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rpi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + float fh = __bfloat162float(h); + asm( "{ cvt.rmi.f32.f32 %0, %0; }\n" + :"+f"(fh)); + return __float2bfloat16_rz(fh); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +, + return __float2bfloat16_rz(rintf(__bfloat162float(h))); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = htrunc(h.x); + const __nv_bfloat16 high = htrunc(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hceil(h.x); + const __nv_bfloat16 high = hceil(h.y); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = hfloor(h.x); + const __nv_bfloat16 high = hfloor(h.y); + return __nv_bfloat162(low, high); +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) +{ + return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); +, + val.x = a.x; + val.y = b.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); +, + val.x = a.y; + val.y = b.y; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); +, + ret = a.x; +) + return ret; +} +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) +{ + int retval; + const __nv_bfloat16_raw araw = __nv_bfloat16_raw(a); + if (araw.x == 0xFF80U) { + retval = -1; + } else if (araw.x == 0x7F80U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.x; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.y; + val.y = a.y; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); +, + ret = a.y; +) + return ret; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); +, + val.x = a; + val.y = b; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); +, + val.x = a; + val.y = a; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); +, + val.x = a.y; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return static_cast(__BFLOAT16_TO_CUS(h)); +, + return static_cast(__nv_bfloat16_raw(h).x); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __BFLOAT16_TO_CUS(h); +, + return __nv_bfloat16_raw(h).x; +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = static_cast(i); + return h; +, + __nv_bfloat16_raw hr; + hr.x = static_cast(i); + return __nv_bfloat16(hr); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = i; + return h; +, + __nv_bfloat16_raw hr; + hr.x = i; + return __nv_bfloat16(hr); +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __nv_bfloat16, __nv_bfloat162 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name, var, delta, c, mask) /* do */ {\ + __nv_bfloat162 r; \ + asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_SYNC_BFLOAT162_MACRO + +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2bfloat16(temp2); +} + 
+/****************************************************************************** +* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm 
("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} + +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm 
("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} + +#undef __LDG_PTR +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __nv_bfloat162 comparison * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ +,\ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " shr.u32 low_res, low_res, 16;\n"\ + " or.b32 %0, high_res, low_res;}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ +)\ + return val; \ +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} 
+__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_heq2(a, b); +, + __nv_bfloat162_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hne2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hle2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hge2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hlt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgt2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hequ2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hleu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgeu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hltu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hgtu2(a, b); +, + __nv_bfloat162_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3F80U : (unsigned short)0U; + return __nv_bfloat162(val); +) +} + +/****************************************************************************** +* __nv_bfloat162 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT162_MACRO_MASK(name) {\ + unsigned val; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".u32.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __COMPARISON_OP_BFLOAT162_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO_MASK + +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + unsigned int val; \ + bool retval; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + if (val == 0x3F803F80U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} + +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) +, + return (__heq(a.x, b.x) && __heq(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) +, + return (__hne(a.x, b.x) && __hne(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool 
__hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) +, + return (__hle(a.x, b.x) && __hle(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) +, + return (__hge(a.x, b.x) && __hge(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) +, + return (__hlt(a.x, b.x) && __hlt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) +, + return (__hgt(a.x, b.x) && __hgt(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) +, + return (__hequ(a.x, b.x) && __hequ(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) +, + return (__hneu(a.x, b.x) && __hneu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) +, + return (__hleu(a.x, b.x) && __hleu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) +, + return (__hgeu(a.x, b.x) && __hgeu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +, + return (__hltu(a.x, b.x) && __hltu(a.y, b.y)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +, + return (__hgtu(a.x, b.x) && __hgtu(a.y, b.y)); +) +} +#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO +/****************************************************************************** +* __nv_bfloat16 comparison * +******************************************************************************/ +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +,\ + unsigned int val; \ + asm( "{.reg .b32 a,b;\n"\ + " mov.b32 a, {0, %1};\n"\ + " mov.b32 b, {0, %2};\n"\ + " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ + :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +)\ +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(eq) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ne) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(le) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ge) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(lt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gt) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(equ) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(neu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(leu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(geu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(ltu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __COMPARISON_OP_BFLOAT16_MACRO(gtu) +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_BFLOAT16_MACRO +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + 
__nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __internal_device_hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2(a, b); +, + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2(a, b); +, + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2(a, b); +, + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hadd2_rn(a, b); +, + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hsub2_rn(a, b); +, + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_device_hmul2_rn(a, b); +, + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); +) + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f,%1,one,%2;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mone;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mone, 0xbf80bf80U;\n" + " fma.rn.bf16x2 f,%2,mone,%1;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{.reg .b32 f, one, zero, mzero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mzero, 0x80008000U;\n" + " fma.rn.bf16x2 f,%1,%2,mzero;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); +, + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ .reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f, %1, %2, %3;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { + __nv_bfloat16 ha, hb; + + ha = __low2bfloat16(a); + hb = __low2bfloat16(b); + + const __nv_bfloat16 v1 = __hdiv(ha, hb); + + ha = __high2bfloat16(a); + hb = __high2bfloat16(b); + + const __nv_bfloat16 v2 = __hdiv(ha, hb); + + return __halves2bfloat162(v1, v2); +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * 
+******************************************************************************/ +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ add.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " 
fma.rn.bf16 %0,%1,c,%2;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ sub.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_sm80_device_hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm( "{ mul.rn.bf16 %0,%1,%2; }\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +) + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, 1.0f, fb)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code 
+ val = __float2bfloat16(__fmaf_ieee_rn(fb, -1.0f, fa)); +) + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + val = __internal_sm80_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + // avoid ftz in device code + val = __float2bfloat16(__fmaf_ieee_rn(fa, fb, -0.0f)); +) + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hadd(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa + fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hsub(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa - fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmul(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa * fb); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hadd_rn(a, b); +, + return __hadd(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hsub_rn(a, b); +, + return __hsub(a, b); + +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + return __internal_sm80_device_hmul_rn(a, b); +, + return __hmul(a, b); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, one, %2;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hadd(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mone;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mone, 0xbf80U;\n" + " fma.rn.bf16 f, %2, mone, %1;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hsub(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm( "{ .reg .b16 f, one, zero, mzero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mzero, 0x8000U;\n" + " fma.rn.bf16 f, %1, %2, mzero;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); +, + val = __hmin(__hmax(__hmul(a, b), CUDART_ZERO_BF16), CUDART_ONE_BF16); +) + return val; +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ 
+ __nv_bfloat16 val; + asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, %2, %3;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ + __nv_bfloat16 val; \ + asm( "{.reg .b32 a,b,res;\n"\ + " mov.b32 a, {0,%1};\n"\ + " mov.b32 b, {0,%2};\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ + " cvt.rn.bf16.f32 %0, res;}\n"\ + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { + const float two_126 = __uint_as_float(0x7E800000U) ; //2^126 + const float a_f = __bfloat162float(a); + float b_f = __bfloat162float(b); + float ans; + bool b_big = (fabsf(b_f) >= two_126); + if(b_big){b_f *= 0.25f;} + + // f32 div approximation. Good enough for c-r bfloat div. 
+ asm("{ div.approx.f32 %0, %1, %2; }" : "=f"(ans) : "f"(a_f), "f"(b_f)); + + // Prevent ftz: + if(b_big){ans = __fmaf_ieee_rn(ans, 0.25f, -0.0f);} + return __float2bfloat16(ans); +} + +#undef __BINARY_OP_BFLOAT16_MACRO +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hdiv(a, b); +, + const float fa = __bfloat162float(a); + const float fb = __bfloat162float(b); + return __float2bfloat16(fa / fb); +) +} + +/****************************************************************************** +* __nv_bfloat162 functions * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + float r = sinf(f); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of sinf() + // Otherwise, ftz=on, then sinf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { + return __hsin_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = cosf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { + return __hcos_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16exp(const float x) +{ + const float log2e_up = __uint_as_float(0x3FB8AA3CU); + float fa = x * log2e_up; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return fa; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16exp(fa); + return __float2bfloat16_rn(fa); +} + +#define __APPROX_FCAST2(fun) /* do */ {\ + __nv_bfloat162 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " mov.b32 fl, {0,hl}; \n"\ + " mov.b32 fu, {0,hu}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ + " cvt.rn.bf16.f32 hl, fl; \n"\ + " cvt.rn.bf16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," 
__CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" + +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + return __floats2bfloat162_rn( __internal_device_fast_bf16exp(__low2float(a)), __internal_device_fast_bf16exp(__high2float(a)) ); +) +} + +__CUDA_BF16_DECL__ float __internal_device_tanhf_noftz(const float x) +{ + float f = x; + float r = tanhf(x); + // Detect compile-time FTZ setting: + // if subnormal constant is not flushed to zero at compile-time, then + // ftz=off, and it is safe to return result of tanhf() + // Otherwise, ftz=on, then tanhf() result is valid for non-flushed + // values, and subnormal input is returned unchanged via else + // branch. 
+ if ((__uint_as_float(0x00000001U) > 0.0f) || (f != 0.0f)) + { + f = r; + } + return f; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a) { + float f = __bfloat162float(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f)); +, + f = __internal_device_tanhf_noftz(f); +) + __nv_bfloat16 h = __float2bfloat16_rn(f); + return h; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a) { + float2 f = __bfloat1622float2(a); +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.x)); + asm("{ tanh.approx.f32 %0, %0; }" : "+f"(f.y)); +, + f.x = __internal_device_tanhf_noftz(f.x); + f.y = __internal_device_tanhf_noftz(f.y); +) + __nv_bfloat162 h = __float22bfloat162_rn(f); + return h; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a) { + __nv_bfloat16 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16_raw hr = (__nv_bfloat16_raw)a; + asm("tanh.approx.bf16 %0, %0;" : "+h"(hr.x)); + r = (__nv_bfloat16)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a) { + __nv_bfloat162 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("tanh.approx.bf16x2 %0, %1;" : "=r"(__BFLOAT162_TO_UI(res)) : "r"(__BFLOAT162_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(ex2) +, + float fl = __low2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { + const float log10_2 = __uint_as_float(0x40549A78U); + float fa = 
__bfloat162float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fa)); + __nv_bfloat16 r = __float2bfloat16_rn(fa); + __nv_bfloat16_raw araw = static_cast<__nv_bfloat16_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + araw.x = 0x3f75U; + r = static_cast<__nv_bfloat16>(araw); + } + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + const float log10_2 = __uint_as_float(0x40549A78U); + float fl = __low2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fl)); + + float fh = __high2float(a) * log10_2; + asm("{ ex2.approx.f32 %0, %0; }" : "+f"(fh)); + + r = __floats2bfloat162_rn( fl, fh ); + + const __nv_bfloat162_raw araw = static_cast<__nv_bfloat162_raw>(a); + if (araw.x == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.x = static_cast<__nv_bfloat16>(raw_fix); + } + if (araw.y == (unsigned short)0xBC95U) + { + __nv_bfloat16_raw raw_fix; + raw_fix.x = (unsigned short)0x3f75U; + r.y = static_cast<__nv_bfloat16>(raw_fix); + } +) + return r; +} + +__CUDA_BF16_DECL__ float __internal_device_fast_bf16log2(float x) +{ + asm("{ lg2.approx.f32 %0, %0; }" : "+f"(x)); + return x; +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + return __float2bfloat16_rn(fa); +} + +__CUDA_BF16_DECL__ 
__nv_bfloat162 h2log2(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(lg2) +, + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { + const float flt_ln2 = __uint_as_float(0x3f317218U); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_ln2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_ln2 = __uint_as_float(0x3f317218U); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_ln2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_ln2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + float fa = __bfloat162float(a); + fa = __internal_device_fast_bf16log2(fa); + fa = fa * flt_log10_2; + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " 
mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +, + const float flt_log10_2 = __uint_as_float(0x3E9A209BU); + + float fl = __low2float(a); + fl = __internal_device_fast_bf16log2(fl); + fl = fl * flt_log10_2; + + float fh = __high2float(a); + fh = __internal_device_fast_bf16log2(fh); + fh = fh * flt_log10_2; + + return __floats2bfloat162_rn( fl, fh ); +) +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { + float fl = __low2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rcp.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(rsqrt) +, + float fl = __low2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fh)); + return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ rsqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __APPROX_FCAST2(sqrt) +, + float fl = __low2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fl)); + float fh = __high2float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fh)); + 
return __floats2bfloat162_rn( fl, fh ); +) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { + float fa = __bfloat162float(a); + asm("{ sqrt.approx.f32 %0, %0; }" : "+f"(fa)); + return __float2bfloat16_rn(fa); +} +#undef __APPROX_FCAST2 +#undef __BF16_SPEC_CASE2 + +__CUDA_BF16_DECL__ bool __internal_device_hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return __BFLOAT16_TO_CUS(r) != 0U; +, + unsigned int r; + asm( "{.reg .b32 a;\n" + " mov.b32 a, {0,%1};\n" + " set.nan.f32.f32 %0, a, a;}\n" + :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); + return r != 0U; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + __nv_bfloat162_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3F80U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3F80U : (unsigned short)0U; + r = __nv_bfloat162(val); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hisnan(a); +, + const __nv_bfloat16_raw hr = static_cast<__nv_bfloat16_raw>(a); + return ((hr.x & 0x7FFFU) > 0x7F80U); +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{neg.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __internal_device_hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{neg.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + const float fa = __bfloat162float(a); + return __float2bfloat16(__fmaf_ieee_rn(fa, -1.0f, -0.0f)); +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hneg(a); +, + const float fa = __bfloat162float(a); + return __float2bfloat16(-fa); +) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{abs.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 r; + asm("{abs.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +, + __nv_bfloat16_raw abs_a_raw = static_cast<__nv_bfloat16_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned 
short)0x7F80U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__nv_bfloat16>(abs_a_raw); +) +} + +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __nv_bfloat16_raw ra = __nv_bfloat16_raw(a); + __nv_bfloat16_raw rb = __nv_bfloat16_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ max.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 maxval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + maxval = CUDART_NAN_BF16; + } + else + { + maxval = __hge(a, b) ? a : b; + } + + return maxval; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat16 val; + asm( "{ min.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +, + __nv_bfloat16 minval; + + if (__hisnan(a) || __hisnan(b)) + { + // if either input is NaN, return canonical NaN + minval = CUDART_NAN_BF16; + } + else + { + minval = __hle(a, b) ? 
a : b; + } + + return minval; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmax_nan(a.x, b.x); + val.y = __hmax_nan(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __nv_bfloat162 val; + asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +, + __nv_bfloat162 val; + val.x = __hmin_nan(a.x, b.x); + val.y = __hmin_nan(a.y, b.y); + return val; +) +} +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); + __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_bfloat162(real_tmp, img_tmp); +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat162 r; + asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" + : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__nv_bfloat162*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + __nv_bfloat16 r; + asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" + : "=h"(__BFLOAT16_TO_US(r)) + : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) + : "memory"); + return r; +, + unsigned short int* address_as_us = (unsigned short int*)address; + unsigned short int old = *address_as_us; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(address_as_us, assumed, + __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); + return __ushort_as_bfloat16(old); +) +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +#endif /* !(defined __DOXYGEN_ONLY__) */ + +#endif /* defined(__cplusplus) */ + +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_BF16_DECL__ +#undef __CUDA_BF16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#undef __CPP_VERSION_AT_LEAST_11_BF16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_HPP__ */ From 
834a905f5747c5f0f9bedbeb0e61fdd8dbc54892 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:44:03 -0700 Subject: [PATCH 11/56] regenerate with ctk13 --- configs/cuda_bf16.yml | 4 +- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 108 ++++++------------- 2 files changed, 33 insertions(+), 79 deletions(-) diff --git a/configs/cuda_bf16.yml b/configs/cuda_bf16.yml index 348e48ee7..9d4727aae 100644 --- a/configs/cuda_bf16.yml +++ b/configs/cuda_bf16.yml @@ -2,9 +2,9 @@ Name: Numba Bfloat16 Version: 0.0.2 GPU Arch: - sm_80 # sm_80 is the first CUDA architecture that supports bfloat16 -Entry Point: ./numba_cuda/numba/cuda/include/12/cuda_bf16.h +Entry Point: ./numba_cuda/numba/cuda/include/13/cuda_bf16.h File List: - - ./numba_cuda/numba/cuda/include/12/cuda_bf16.h + - ./numba_cuda/numba/cuda/include/13/cuda_bf16.h Exclude: {} Types: __nv_bfloat16_raw: Number diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index fcc70298d..6483cfd73 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -12,7 +12,6 @@ # Imports: import io import operator -import os import numba from llvmlite import ir @@ -25,9 +24,9 @@ ) from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast -from numba.cuda.typing.templates import Registry as TypingRegistry -from numba.cuda.typing import signature -from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing import signature +from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device from numba.cuda.types import bfloat16 from numba.cuda.vector_types import vector_types @@ -55,9 +54,6 @@ float32x2 = vector_types["float32x2"] __half = float16 -# Setups: - - typing_registry = TypingRegistry() register = 
typing_registry.register register_attr = typing_registry.register_attr @@ -299,68 +295,12 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, CPointer(_type_unnamed1405307)), - value, - ) - - -_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) - - -def _lower__float16_to_bfloat16(shim_stream, shim_obj): - shim_raw_str = """ - extern "C" __device__ int - _ZN13__float162bfloat16_nbst(int &ignore, __nv_bfloat16 *self , __half* hr) { - new (self) __nv_bfloat16(*hr); - return 0; - } - """ - - _ctor_decl___float162bfloat16 = declare_device( - "_ZN13__float162bfloat16_nbst", - int32(CPointer(_type___nv_bfloat16), CPointer(float16)), - ) - - def __float162bfloat16_device_caller(arg_0, arg_1): - return _ctor_decl___float162bfloat16(arg_0, arg_1) - - def ctor_impl(context, builder, sig, args): - context.active_code_library.add_linking_file(shim_obj) - shim_stream.write_with_key("_ZN13__float162bfloat16_nbst", shim_raw_str) - selfptr = builder.alloca( - context.get_value_type(_type___nv_bfloat16), name="selfptr" - ) - argptrs = [ - builder.alloca(context.get_value_type(arg)) for arg in sig.args - ] - for ptr, ty, arg in zip(argptrs, sig.args, args): - builder.store(arg, ptr, align=getattr(ty, "alignof_", None)) - - context.compile_internal( - builder, - __float162bfloat16_device_caller, - signature( - int32, - CPointer(_type___nv_bfloat16), - CPointer(float16), - ), - (selfptr, *argptrs), - ) - return builder.load( - selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) - ) - - @lower_cast(float16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, float16), + signature(_type___nv_bfloat16, fromty), [value], ) -_lower__float16_to_bfloat16(shim_stream, shim_obj) +_lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) def 
_lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj): @@ -454,7 +394,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, float32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -507,7 +447,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, float64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -560,7 +500,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int16), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -613,7 +553,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint16), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -666,7 +606,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -719,7 +659,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint32), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -772,7 +712,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -825,7 +765,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -878,7 +818,7 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, int64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -931,7 +871,7 @@ def 
conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat16, uint64), + signature(_type___nv_bfloat16, fromty), [value], ) @@ -1954,8 +1894,8 @@ def conversion_impl(context, builder, fromty, toty, value): return ctor_impl( context, builder, - signature(_type___nv_bfloat162, CPointer(_type_unnamed1405416)), - value, + signature(_type___nv_bfloat162, fromty), + [value], ) @@ -13672,6 +13612,10 @@ def impl(context, builder, sig, args): _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj) +def __half(): + pass + + def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): shim_raw_str = """ extern "C" __device__ int @@ -15977,6 +15921,15 @@ class _typing_atomicAdd(ConcreteTemplate): register_global(atomicAdd, types.Function(_typing_atomicAdd)) +@register +class _typing___half(ConcreteTemplate): + key = globals()["__half"] + cases = [signature(void, _type___nv_bfloat16)] + + +register_global(__half, types.Function(_typing___half)) + + @register_global(operator.add) class _typing_operator_add(ConcreteTemplate): cases = [ @@ -16427,6 +16380,7 @@ class _typing_operator_le(ConcreteTemplate): "operator<", "operator>=", "operator<=", + "__half", ] From 577f00aafb1e41af6dc69385db155e3d1d1da904 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 14 Aug 2025 14:45:46 -0700 Subject: [PATCH 12/56] explicitly test against bfloat16 type --- numba_cuda/numba/cuda/bf16.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 693a8e573..65446d51f 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -20,14 +20,13 @@ htanh, htanh_approx, ) -from numba.cuda.types import Bfloat16 from numba.extending import overload import math def _make_unary(a, func): - if isinstance(a, Bfloat16): + if a == bfloat16: return lambda a: func(a) From 765f8ee4143a811830759233c2115a50f9dab0f0 Mon Sep 17 00:00:00 2001 
From: Michael Wang Date: Fri, 15 Aug 2025 10:43:38 -0700 Subject: [PATCH 13/56] hand write lower cast fp16->bf16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 6483cfd73..2a7032d9c 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -345,6 +345,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # In C++, this cast is an explicit constructor call, by default Numbast will not generate + # this lower cast. We implement this by hand to enable the cast from fp16 to bf16. + @lower_cast(float16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From c443e4d6624e2ce82e89ec8148d78a6c6148df93 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 11:07:10 -0700 Subject: [PATCH 14/56] ptx test for several basic ptx --- .../tests/cudapy/test_bfloat16_bindings.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py index 3538fb230..abc8e47d8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py @@ -1,6 +1,7 @@ import numba.cuda as cuda from numba.cuda.testing import unittest, CUDATestCase import numpy as np +import operator from numba import ( config, @@ -291,6 +292,36 @@ def kernel(arr): np.testing.assert_allclose(arr, [3], atol=1e-2) + def test_bf16_intrinsics_used_in_lto(self): + self.skip_unsupported() + + operations = [ + (operator.add, "fma.rn.bf16"), + 
(operator.sub, "fma.rn.bf16"), + (operator.mul, "fma.rn.bf16"), + ( + operator.truediv, + "div.approx.f32", + ), # no native bf16 div, see cuda_bf16.hpp:L3067 + ] + + for op, ptx_op in operations: + with self.subTest(op=op): + + @cuda.jit(lto=True) + def kernel(arr): + a = nv_bfloat16(3.14) + b = nv_bfloat16(5) + arr[0] = float32(op(a, b)) + + arr = np.zeros(1, np.float32) + kernel[1, 1](arr) + np.testing.assert_allclose(arr, [op(3.14, 5)], atol=1e-1) + + ptx = next(iter(kernel.inspect_lto_ptx().values())) + + assert ptx_op in ptx, ptx + if __name__ == "__main__": unittest.main() From 166c9ae74c704ae5e4b4bdf619c27a65a86e28cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 12:44:33 -0700 Subject: [PATCH 15/56] add double underscore intrinsics --- numba_cuda/numba/cuda/bf16.py | 34 ++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 101 +++++++++++++++++- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 65446d51f..1ac3798c0 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -2,6 +2,23 @@ typing_registry, target_registry, nv_bfloat16 as bfloat16, + # Arithmetic intrinsics + __habs, + __hadd, + __hsub, + __hmul, + __hadd_rn, + __hsub_rn, + __hmul_rn, + __hdiv, + __hadd_sat, + __hsub_sat, + __hmul_sat, + __hfma, + __hfma_sat, + __hneg, + __hfma_relu, + atomicAdd, htrunc, hceil, hfloor, @@ -95,6 +112,23 @@ def exp2_ol(a): "typing_registry", "target_registry", "bfloat16", + # Arithmetic intrinsics + "__habs", + "__hadd", + "__hsub", + "__hmul", + "__hadd_rn", + "__hsub_rn", + "__hmul_rn", + "__hdiv", + "__hadd_sat", + "__hsub_sat", + "__hmul_sat", + "__hfma", + "__hfma_sat", + "__hneg", + "__hfma_relu", + "atomicAdd", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 49e843abe..af25a3860 100644 --- 
a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,5 +1,22 @@ from numba import cuda, float32 -from numba.cuda.bf16 import bfloat16 +from numba.cuda.bf16 import ( + bfloat16, + __habs, + __hadd, + __hsub, + __hmul, + __hdiv, + __hadd_rn, + __hsub_rn, + __hmul_rn, + __hadd_sat, + __hsub_sat, + __hmul_sat, + __hfma, + __hfma_sat, + __hfma_relu, + __hneg, +) from numba.cuda.testing import CUDATestCase import math @@ -60,3 +77,85 @@ def kernel(arr): self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1) else: self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2) + + def test_arithmetic_intrinsics_basic(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.25) + b = bfloat16(-2.5) + + out[0] = float32(__habs(b)) + out[1] = float32(__hadd(a, b)) + out[2] = float32(__hsub(a, b)) + out[3] = float32(__hmul(a, b)) + out[4] = float32(__hdiv(b, a)) + out[5] = float32(__hneg(a)) + out[6] = float32(__hfma(a, b, b)) + + out[7] = float32(__hadd_rn(a, b)) + out[8] = float32(__hsub_rn(a, b)) + out[9] = float32(__hmul_rn(a, b)) + + out = cuda.device_array((10,), dtype="float32") + kernel[1, 1](out) + + a = 1.25 + b = -2.5 + expected = [ + abs(b), + a + b, + a - b, + a * b, + b / a, + -a, + a * b + b, + a + b, + a - b, + a * b, + ] + for i, exp in enumerate(expected): + self.assertAlmostEqual(out[i], exp, delta=1e-2) + + def test_arithmetic_intrinsics_saturating(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + b = bfloat16(0.75) + + out[0] = float32(__hadd_sat(a, b)) # 2.25 -> 1.0 + out[1] = float32(__hsub_sat(b, a)) # -0.75 -> 0.0 + out[2] = float32(__hmul_sat(a, b)) # 1.125 -> 1.0 + out[3] = float32(__hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.0, delta=1e-3) + self.assertAlmostEqual(out[1], 0.0, delta=1e-3) + self.assertAlmostEqual(out[2], 1.0, 
delta=1e-3) + self.assertAlmostEqual(out[3], 1.0, delta=1e-3) + + # Also check they are clamped within [0, 1] + for i in range(4): + self.assertGreaterEqual(out[i], 0.0) + self.assertLessEqual(out[i], 1.0) + + def test_fma_relu_intrinsic(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(-1.5) + b = bfloat16(2.0) + c = bfloat16(0.0) + + out[0] = float32(__hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 0.0, delta=1e-3) From 6d8fd66f0d823738acace44f30c9078a19a3f0b2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 12:51:37 -0700 Subject: [PATCH 16/56] regnerate with globals --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1223 ++++++++++++------ 1 file changed, 817 insertions(+), 406 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 2a7032d9c..86ddba13b 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,7 +1,7 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: # Ast_canopy version: 0.4.0 -# Numbast version: 0.4.0 +# Numbast version: 0.5.0 # Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} # Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml @@ -23,12 +23,11 @@ register_model, ) from numba.core.imputils import Registry as TargetRegistry -from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from 
numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda.types import bfloat16 +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -52,7 +51,7 @@ ) float32x2 = vector_types["float32x2"] -__half = float16 + typing_registry = TypingRegistry() register = typing_registry.register @@ -190,7 +189,28 @@ class _ctor_template_unnamed1405416(ConcreteTemplate): register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -290,15 +310,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) @@ -345,17 +356,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # In C++, this cast is an explicit constructor call, by default 
Numbast will not generate - # this lower cast. We implement this by hand to enable the cast from fp16 to bf16. - @lower_cast(float16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -400,15 +400,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(float32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) @@ -453,15 +444,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(float64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) @@ -506,15 +488,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) @@ -559,15 +532,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint16, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) @@ -612,15 +576,6 @@ def ctor_impl(context, builder, sig, args): 
selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) @@ -665,15 +620,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint32, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) @@ -718,15 +664,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) @@ -771,15 +708,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(uint64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) @@ -824,15 +752,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - @lower_cast(int64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) @@ -877,15 +796,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) 
) - @lower_cast(uint64, _type___nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @@ -1900,15 +1810,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) - @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(_type___nv_bfloat162, fromty), - [value], - ) - _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) @@ -2002,7 +1903,9 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - @lower(__double2bfloat16, float64) + handle = globals()["__double2bfloat16"] + + @lower(handle, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) @@ -2041,7 +1944,9 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - @lower(__float2bfloat16, float32) + handle = globals()["__float2bfloat16"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) @@ -2080,7 +1985,9 @@ def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - @lower(__float2bfloat16_rn, float32) + handle = globals()["__float2bfloat16_rn"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -2121,7 +2028,9 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - @lower(__float2bfloat16_rz, float32) + handle = globals()["__float2bfloat16_rz"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2162,7 +2071,9 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - @lower(__float2bfloat16_rd, float32) + handle = globals()["__float2bfloat16_rd"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2203,7 +2114,9 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - @lower(__float2bfloat16_ru, float32) + handle = globals()["__float2bfloat16_ru"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2245,7 +2158,9 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162float, _type___nv_bfloat16) + handle = globals()["__bfloat162float"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2287,7 +2202,9 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - @lower(__float2bfloat162_rn, float32) + handle = 
globals()["__float2bfloat162_rn"] + + @lower(handle, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2329,7 +2246,9 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - @lower(__floats2bfloat162_rn, float32, float32) + handle = globals()["__floats2bfloat162_rn"] + + @lower(handle, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2373,7 +2292,9 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - @lower(__low2float, _type___nv_bfloat162) + handle = globals()["__low2float"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2415,7 +2336,9 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - @lower(__high2float, _type___nv_bfloat162) + handle = globals()["__high2float"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2457,7 +2380,9 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - @lower(__float22bfloat162_rn, float32x2) + handle = globals()["__float22bfloat162_rn"] + + @lower(handle, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-2499,7 +2424,9 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - @lower(__bfloat1622float2, _type___nv_bfloat162) + handle = globals()["__bfloat1622float2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2541,7 +2468,9 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162char_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162char_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2583,7 +2512,9 @@ def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uchar_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162uchar_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2625,7 +2556,9 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2667,7 +2600,9 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, 
shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2709,7 +2644,9 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162int_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2751,7 +2688,9 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162int_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162int_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2792,7 +2731,9 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - @lower(__int2bfloat16_rn, int32) + handle = globals()["__int2bfloat16_rn"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) @@ -2831,7 +2772,9 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - @lower(__int2bfloat16_rz, int32) + handle = 
globals()["__int2bfloat16_rz"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) @@ -2870,7 +2813,9 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - @lower(__int2bfloat16_rd, int32) + handle = globals()["__int2bfloat16_rd"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) @@ -2909,7 +2854,9 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - @lower(__int2bfloat16_ru, int32) + handle = globals()["__int2bfloat16_ru"] + + @lower(handle, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) @@ -2949,7 +2896,9 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2991,7 +2940,9 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3033,7 +2984,9 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162short_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3075,7 +3028,9 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162short_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162short_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3116,7 +3071,9 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - @lower(__short2bfloat16_rn, int16) + handle = globals()["__short2bfloat16_rn"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3157,7 +3114,9 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return _ZL19__short2bfloat16_rzs_nbst(arg_0) - @lower(__short2bfloat16_rz, int16) + handle = globals()["__short2bfloat16_rz"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3198,7 +3157,9 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def 
_ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - @lower(__short2bfloat16_rd, int16) + handle = globals()["__short2bfloat16_rd"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3239,7 +3200,9 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - @lower(__short2bfloat16_ru, int16) + handle = globals()["__short2bfloat16_ru"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3281,7 +3244,9 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3323,7 +3288,9 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3365,7 +3332,9 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_rd, _type___nv_bfloat16) + handle = 
globals()["__bfloat162uint_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3407,7 +3376,9 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162uint_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162uint_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3448,7 +3419,9 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - @lower(__uint2bfloat16_rn, uint32) + handle = globals()["__uint2bfloat16_rn"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3489,7 +3462,9 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - @lower(__uint2bfloat16_rz, uint32) + handle = globals()["__uint2bfloat16_rz"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3530,7 +3505,9 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - @lower(__uint2bfloat16_rd, uint32) + handle = globals()["__uint2bfloat16_rd"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3571,7 +3548,9 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): def 
_ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - @lower(__uint2bfloat16_ru, uint32) + handle = globals()["__uint2bfloat16_ru"] + + @lower(handle, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3613,7 +3592,9 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3655,7 +3636,9 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3697,7 +3680,9 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3739,7 +3724,9 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ushort_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ushort_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3780,7 +3767,9 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - @lower(__ushort2bfloat16_rn, uint16) + handle = globals()["__ushort2bfloat16_rn"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3821,7 +3810,9 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - @lower(__ushort2bfloat16_rz, uint16) + handle = globals()["__ushort2bfloat16_rz"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3862,7 +3853,9 @@ def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - @lower(__ushort2bfloat16_rd, uint16) + handle = globals()["__ushort2bfloat16_rd"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3903,7 +3896,9 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - @lower(__ushort2bfloat16_ru, uint16) + handle = globals()["__ushort2bfloat16_ru"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3945,7 
+3940,9 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3987,7 +3984,9 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4031,7 +4030,9 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["make_bfloat162"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4077,7 +4078,9 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4119,7 +4122,9 @@ def 
_lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ull_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ull_ru"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4160,7 +4165,9 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - @lower(__ull2bfloat16_rn, uint64) + handle = globals()["__ull2bfloat16_rn"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) @@ -4199,7 +4206,9 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - @lower(__ull2bfloat16_rz, uint64) + handle = globals()["__ull2bfloat16_rz"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str) @@ -4238,7 +4247,9 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - @lower(__ull2bfloat16_rd, uint64) + handle = globals()["__ull2bfloat16_rd"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) @@ -4277,7 +4288,9 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - @lower(__ull2bfloat16_ru, 
uint64) + handle = globals()["__ull2bfloat16_ru"] + + @lower(handle, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) @@ -4317,7 +4330,9 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rn, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rn"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4359,7 +4374,9 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rz, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rz"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4401,7 +4418,9 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_rd, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_rd"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4443,7 +4462,9 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162ll_ru, _type___nv_bfloat16) + handle = globals()["__bfloat162ll_ru"] + + @lower(handle, _type___nv_bfloat16) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4484,7 +4505,9 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - @lower(__ll2bfloat16_rn, int64) + handle = globals()["__ll2bfloat16_rn"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) @@ -4523,7 +4546,9 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - @lower(__ll2bfloat16_rz, int64) + handle = globals()["__ll2bfloat16_rz"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) @@ -4562,7 +4587,9 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - @lower(__ll2bfloat16_rd, int64) + handle = globals()["__ll2bfloat16_rd"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) @@ -4601,7 +4628,9 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - @lower(__ll2bfloat16_ru, int64) + handle = globals()["__ll2bfloat16_ru"] + + @lower(handle, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) @@ -4641,7 +4670,9 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - @lower(htrunc, _type___nv_bfloat16) + handle = globals()["htrunc"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4683,7 +4714,9 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - @lower(hceil, _type___nv_bfloat16) + handle = globals()["hceil"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4725,7 +4758,9 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - @lower(hfloor, _type___nv_bfloat16) + handle = globals()["hfloor"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4767,7 +4802,9 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - @lower(hrint, _type___nv_bfloat16) + handle = globals()["hrint"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4809,7 +4846,9 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - @lower(h2trunc, _type___nv_bfloat162) + handle = globals()["h2trunc"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4851,7 +4890,9 @@ 
def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - @lower(h2ceil, _type___nv_bfloat162) + handle = globals()["h2ceil"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4893,7 +4934,9 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - @lower(h2floor, _type___nv_bfloat162) + handle = globals()["h2floor"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4935,7 +4978,9 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - @lower(h2rint, _type___nv_bfloat162) + handle = globals()["h2rint"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4977,7 +5022,9 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - @lower(__bfloat162bfloat162, _type___nv_bfloat16) + handle = globals()["__bfloat162bfloat162"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5019,7 +5066,9 @@ def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - @lower(__lowhigh2highlow, _type___nv_bfloat162) 
+ handle = globals()["__lowhigh2highlow"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5063,7 +5112,9 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__lows2bfloat162"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5111,7 +5162,9 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__highs2bfloat162"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5157,7 +5210,9 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - @lower(__high2bfloat16, _type___nv_bfloat162) + handle = globals()["__high2bfloat16"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5199,7 +5254,9 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - @lower(__low2bfloat16, 
_type___nv_bfloat162) + handle = globals()["__low2bfloat16"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5240,7 +5297,9 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - @lower(__hisinf, _type___nv_bfloat16) + handle = globals()["__hisinf"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5284,7 +5343,9 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__halves2bfloat162"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5330,7 +5391,9 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - @lower(__low2bfloat162, _type___nv_bfloat162) + handle = globals()["__low2bfloat162"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5372,7 +5435,9 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - @lower(__high2bfloat162, _type___nv_bfloat162) + handle = globals()["__high2bfloat162"] + + @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5414,7 +5479,9 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat16_as_short, _type___nv_bfloat16) + handle = globals()["__bfloat16_as_short"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5456,7 +5523,9 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - @lower(__bfloat16_as_ushort, _type___nv_bfloat16) + handle = globals()["__bfloat16_as_ushort"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5497,7 +5566,9 @@ def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - @lower(__short_as_bfloat16, int16) + handle = globals()["__short_as_bfloat16"] + + @lower(handle, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5538,7 +5609,9 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - @lower(__ushort_as_bfloat16, uint16) + handle = globals()["__ushort_as_bfloat16"] + + @lower(handle, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5589,7 +5662,9 @@ def 
_ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) + handle = globals()["__shfl_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5646,7 +5721,9 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) + handle = globals()["__shfl_up_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5703,7 +5780,9 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) + handle = globals()["__shfl_down_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5760,7 +5839,9 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) + handle = globals()["__shfl_xor_sync"] + + @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5813,7 +5894,9 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) + handle = globals()["__shfl_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5866,7 +5949,9 @@ def 
_ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) + handle = globals()["__shfl_up_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5919,7 +6004,9 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) + handle = globals()["__shfl_down_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5972,7 +6059,9 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) + handle = globals()["__shfl_xor_sync"] + + @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6020,7 +6109,9 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldg, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldg"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6060,7 +6151,9 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldg, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldg"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6104,7 +6197,9 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcg, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcg"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6144,7 +6239,9 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcg, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcg"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6188,7 +6285,9 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldca, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldca"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6228,7 +6327,9 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldca, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldca"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6272,7 +6373,9 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): 
return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcs, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcs"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6312,7 +6415,9 @@ def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcs, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcs"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6356,7 +6461,9 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldlu, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldlu"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6396,7 +6503,9 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldlu, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldlu"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6440,7 +6549,9 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - @lower(__ldcv, CPointer(_type___nv_bfloat162)) + handle = globals()["__ldcv"] + + @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6480,7 +6591,9 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - @lower(__ldcv, CPointer(_type___nv_bfloat16)) + handle = globals()["__ldcv"] + + @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6527,7 +6640,9 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stwb"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6572,7 +6687,9 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stwb"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6621,7 +6738,9 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stcg"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6666,7 +6785,9 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stcg"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6715,7 +6836,9 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stcs"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6760,7 +6883,9 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stcs"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6809,7 +6934,9 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["__stwt"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6854,7 +6981,9 @@ def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["__stwt"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6902,7 +7031,9 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__heq2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6950,7 +7081,9 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hne2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6998,7 +7131,9 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hle2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7046,7 +7181,9 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hge2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7094,7 +7231,9 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hlt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7142,7 +7281,9 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7190,7 +7331,9 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hequ2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7238,7 +7381,9 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hneu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7286,7 +7431,9 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hleu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7334,7 +7481,9 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgeu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7382,7 +7531,9 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hltu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7430,7 +7581,9 
@@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgtu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7476,7 +7629,9 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__heq2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7522,7 +7677,9 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hne2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7568,7 +7725,9 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hle2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7614,7 +7773,9 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hge2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7660,7 +7821,9 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hlt2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7706,7 +7869,9 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgt2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7752,7 +7917,9 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hequ2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7798,7 +7965,9 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hneu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7844,7 +8013,9 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hleu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7890,7 +8061,9 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgeu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7936,7 +8109,9 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hltu2_mask"] + + @lower(handle, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7982,7 +8157,9 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hgtu2_mask"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8028,7 +8205,9 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - @lower(__hisnan2, _type___nv_bfloat162) + handle = globals()["__hisnan2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8072,7 +8251,9 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8120,7 +8301,9 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8168,7 +8351,9 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8216,7 +8401,9 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8264,7 +8451,9 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8312,7 +8501,9 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2_rn"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8360,7 +8551,9 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__h2div"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8406,7 +8599,9 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - @lower(__habs2, _type___nv_bfloat162) + handle = globals()["__habs2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8450,7 +8645,9 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hadd2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8498,7 +8695,9 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hsub2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8546,7 +8745,9 @@ def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmul2_sat"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8596,11 +8797,10 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2"] + @lower( - __hfma2, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -8652,11 +8852,10 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2_sat"] + @lower( - __hfma2_sat, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -8704,7 +8903,9 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - @lower(__hneg2, _type___nv_bfloat162) + handle = globals()["__hneg2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8746,7 +8947,9 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - @lower(__habs, _type___nv_bfloat16) + handle = globals()["__habs"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8790,7 +8993,9 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8838,7 +9043,9 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8886,7 +9093,9 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8934,7 +9143,9 @@ def 
_lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8982,7 +9193,9 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9030,7 +9243,9 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul_rn"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9078,7 +9293,9 @@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hdiv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9126,7 +9343,9 @@ def 
_lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hadd_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9174,7 +9393,9 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hsub_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9222,7 +9443,9 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmul_sat"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9272,8 +9495,10 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma"] + @lower( - __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9325,11 +9550,10 @@ 
def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma_sat"] + @lower( - __hfma_sat, - _type___nv_bfloat16, - _type___nv_bfloat16, - _type___nv_bfloat16, + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9377,7 +9601,9 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - @lower(__hneg, _type___nv_bfloat16) + handle = globals()["__hneg"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9419,7 +9645,9 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbeq2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbeq2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9465,7 +9693,9 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbne2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9511,7 +9741,9 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hble2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9557,7 +9789,9 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbge2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9603,7 +9837,9 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hblt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9649,7 +9885,9 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgt2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9695,7 +9933,9 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbequ2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9741,7 +9981,9 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbneu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9787,7 +10029,9 @@ def _lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbleu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9833,7 +10077,9 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgeu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9879,7 +10125,9 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbltu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9925,7 +10173,9 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hbgtu2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hbgtu2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9971,7 +10221,9 @@ def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__heq"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10017,7 +10269,9 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hne"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10063,7 +10317,9 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return 
_ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hle"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10109,7 +10365,9 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hge"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10155,7 +10413,9 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hlt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10201,7 +10461,9 @@ def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10247,7 +10509,9 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) 
+ handle = globals()["__hequ"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10293,7 +10557,9 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hneu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10339,7 +10605,9 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hleu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10385,7 +10653,9 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgeu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10431,7 +10701,9 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hltu"] + + @lower(handle, _type___nv_bfloat16, 
_type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10477,7 +10749,9 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hgtu"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10522,7 +10796,9 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - @lower(__hisnan, _type___nv_bfloat16) + handle = globals()["__hisnan"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10566,7 +10842,9 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmax"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10614,7 +10892,9 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmin"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -10662,7 +10942,9 @@ def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmax_nan"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10710,7 +10992,9 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["__hmin_nan"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10760,11 +11044,10 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma_relu"] + @lower( - __hfma_relu, - _type___nv_bfloat16, - _type___nv_bfloat16, - _type___nv_bfloat16, + handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -10814,7 +11097,9 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmax2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10862,7 +11147,9 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmin2"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10910,7 +11197,9 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmax2_nan"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10958,7 +11247,9 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["__hmin2_nan"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11008,11 +11299,10 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hfma2_relu"] + @lower( - __hfma2_relu, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, 
_type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11064,11 +11354,10 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) + handle = globals()["__hcmadd"] + @lower( - __hcmadd, - _type___nv_bfloat162, - _type___nv_bfloat162, - _type___nv_bfloat162, + handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11116,7 +11405,9 @@ def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - @lower(hsqrt, _type___nv_bfloat16) + handle = globals()["hsqrt"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11158,7 +11449,9 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - @lower(hrsqrt, _type___nv_bfloat16) + handle = globals()["hrsqrt"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11200,7 +11493,9 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - @lower(hrcp, _type___nv_bfloat16) + handle = globals()["hrcp"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) @@ -11240,7 +11535,9 @@ 
def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - @lower(hlog, _type___nv_bfloat16) + handle = globals()["hlog"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) @@ -11280,7 +11577,9 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - @lower(hlog2, _type___nv_bfloat16) + handle = globals()["hlog2"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11322,7 +11621,9 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - @lower(hlog10, _type___nv_bfloat16) + handle = globals()["hlog10"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11364,7 +11665,9 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - @lower(hexp, _type___nv_bfloat16) + handle = globals()["hexp"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) @@ -11404,7 +11707,9 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - @lower(htanh_approx, _type___nv_bfloat16) + handle = globals()["htanh_approx"] + + 
@lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11446,7 +11751,9 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - @lower(h2tanh_approx, _type___nv_bfloat162) + handle = globals()["h2tanh_approx"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11488,7 +11795,9 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - @lower(htanh, _type___nv_bfloat16) + handle = globals()["htanh"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11530,7 +11839,9 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - @lower(h2tanh, _type___nv_bfloat162) + handle = globals()["h2tanh"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11572,7 +11883,9 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - @lower(hexp2, _type___nv_bfloat16) + handle = globals()["hexp2"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11614,7 +11927,9 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): 
return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - @lower(hexp10, _type___nv_bfloat16) + handle = globals()["hexp10"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11656,7 +11971,9 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - @lower(hcos, _type___nv_bfloat16) + handle = globals()["hcos"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) @@ -11696,7 +12013,9 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - @lower(hsin, _type___nv_bfloat16) + handle = globals()["hsin"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) @@ -11736,7 +12055,9 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - @lower(h2sqrt, _type___nv_bfloat162) + handle = globals()["h2sqrt"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11778,7 +12099,9 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - @lower(h2rsqrt, _type___nv_bfloat162) + handle = globals()["h2rsqrt"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -11820,7 +12143,9 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - @lower(h2rcp, _type___nv_bfloat162) + handle = globals()["h2rcp"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11862,7 +12187,9 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - @lower(h2log, _type___nv_bfloat162) + handle = globals()["h2log"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11904,7 +12231,9 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - @lower(h2log2, _type___nv_bfloat162) + handle = globals()["h2log2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11946,7 +12275,9 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - @lower(h2log10, _type___nv_bfloat162) + handle = globals()["h2log10"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11988,7 +12319,9 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - @lower(h2exp, _type___nv_bfloat162) + handle = globals()["h2exp"] + + @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12030,7 +12363,9 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - @lower(h2exp2, _type___nv_bfloat162) + handle = globals()["h2exp2"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12072,7 +12407,9 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - @lower(h2exp10, _type___nv_bfloat162) + handle = globals()["h2exp10"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12114,7 +12451,9 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - @lower(h2cos, _type___nv_bfloat162) + handle = globals()["h2cos"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12156,7 +12495,9 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - @lower(h2sin, _type___nv_bfloat162) + handle = globals()["h2sin"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12201,7 +12542,9 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + handle = globals()["atomicAdd"] + + @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12246,7 +12589,9 @@ def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + handle = globals()["atomicAdd"] + + @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12290,7 +12635,9 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.add"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12334,7 +12681,9 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.sub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12378,7 +12727,9 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return 
_ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.mul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.mul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12422,7 +12773,9 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.truediv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12466,7 +12819,9 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.iadd"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12510,7 +12865,9 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.isub"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12554,7 +12911,9 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - 
@lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.imul"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12598,7 +12957,9 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.itruediv"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12640,7 +13001,9 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat16) + handle = globals()["operator.pos"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) @@ -12676,7 +13039,9 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat16) + handle = globals()["operator.neg"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) @@ -12712,7 +13077,9 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.eq"] + + 
@lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12754,7 +13121,9 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.ne"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12796,7 +13165,9 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.gt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12838,7 +13209,9 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.lt"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12880,7 +13253,9 @@ def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.ge"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, 
sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12922,7 +13297,9 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) + handle = globals()["operator.le"] + + @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12966,7 +13343,9 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.add"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13010,7 +13389,9 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.sub"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13054,7 +13435,9 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.mul"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13098,7 +13481,9 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.truediv"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13142,7 +13527,9 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.iadd"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13186,7 +13573,9 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.isub"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13230,7 +13619,9 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.imul"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13274,7 +13665,9 @@ def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.itruediv"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13316,7 +13709,9 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - @lower(operator.pos, _type___nv_bfloat162) + handle = globals()["operator.pos"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) @@ -13352,7 +13747,9 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - @lower(operator.neg, _type___nv_bfloat162) + handle = globals()["operator.neg"] + + @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) @@ -13388,7 +13785,9 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.eq"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-13430,7 +13829,9 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.ne"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13472,7 +13873,9 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.gt"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13514,7 +13917,9 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.lt"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13556,7 +13961,9 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.ge"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13598,7 +14005,9 @@ def 
_lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) + handle = globals()["operator.le"] + + @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13643,7 +14052,9 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - @lower(__half, _type___nv_bfloat16) + handle = globals()["__half"] + + @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( From c647ae320486b998fb6ea724377db5a02ce19763 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 17/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 86ddba13b..398db5cbf 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -27,7 +27,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,8 +48,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -180,37 +181,7 @@ 
def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -356,6 +327,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. 
+ @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 4b262e999a3059e59ddf0f64bd42ca2550bce4f4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:24:02 -0700 Subject: [PATCH 18/56] generate the bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1104 +++++++++++++----- 1 file changed, 825 insertions(+), 279 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 398db5cbf..fb4e4de21 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -2,7 +2,7 @@ # Generator Information: # Ast_canopy version: 0.4.0 # Numbast version: 0.5.0 -# Generation command: /home/wangm/miniforge3/envs/numbast/lib/python3.13/site-packages/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ +# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} # Config file path (relative to the path of the generated binding): ../../../../../configs/cuda_bf16.yml # Cudatoolkit version: (12, 8) @@ -27,6 +27,7 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -48,10 +49,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = 
vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -181,7 +180,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -327,17 +356,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. 
- @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -1885,7 +1903,9 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - handle = globals()["__double2bfloat16"] + handle = globals().get("__double2bfloat16") + if handle is None: + handle = __double2bfloat16 @lower(handle, float64) def impl(context, builder, sig, args): @@ -1926,7 +1946,9 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - handle = globals()["__float2bfloat16"] + handle = globals().get("__float2bfloat16") + if handle is None: + handle = __float2bfloat16 @lower(handle, float32) def impl(context, builder, sig, args): @@ -1967,7 +1989,9 @@ def _lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - handle = globals()["__float2bfloat16_rn"] + handle = globals().get("__float2bfloat16_rn") + if handle is None: + handle = __float2bfloat16_rn @lower(handle, float32) def impl(context, builder, sig, args): @@ -2010,7 +2034,9 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - handle = globals()["__float2bfloat16_rz"] + handle = globals().get("__float2bfloat16_rz") + if handle is None: + handle = __float2bfloat16_rz @lower(handle, float32) def impl(context, builder, sig, args): @@ -2053,7 +2079,9 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - handle = 
globals()["__float2bfloat16_rd"] + handle = globals().get("__float2bfloat16_rd") + if handle is None: + handle = __float2bfloat16_rd @lower(handle, float32) def impl(context, builder, sig, args): @@ -2096,7 +2124,9 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - handle = globals()["__float2bfloat16_ru"] + handle = globals().get("__float2bfloat16_ru") + if handle is None: + handle = __float2bfloat16_ru @lower(handle, float32) def impl(context, builder, sig, args): @@ -2140,7 +2170,9 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162float"] + handle = globals().get("__bfloat162float") + if handle is None: + handle = __bfloat162float @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2184,7 +2216,9 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - handle = globals()["__float2bfloat162_rn"] + handle = globals().get("__float2bfloat162_rn") + if handle is None: + handle = __float2bfloat162_rn @lower(handle, float32) def impl(context, builder, sig, args): @@ -2228,7 +2262,9 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - handle = globals()["__floats2bfloat162_rn"] + handle = globals().get("__floats2bfloat162_rn") + if handle is None: + handle = __floats2bfloat162_rn @lower(handle, float32, float32) def impl(context, builder, sig, args): @@ -2274,7 +2310,9 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return 
_ZL11__low2float14__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2float"] + handle = globals().get("__low2float") + if handle is None: + handle = __low2float @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2318,7 +2356,9 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2float"] + handle = globals().get("__high2float") + if handle is None: + handle = __high2float @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2362,7 +2402,9 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - handle = globals()["__float22bfloat162_rn"] + handle = globals().get("__float22bfloat162_rn") + if handle is None: + handle = __float22bfloat162_rn @lower(handle, float32x2) def impl(context, builder, sig, args): @@ -2406,7 +2448,9 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - handle = globals()["__bfloat1622float2"] + handle = globals().get("__bfloat1622float2") + if handle is None: + handle = __bfloat1622float2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -2450,7 +2494,9 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162char_rz"] + handle = globals().get("__bfloat162char_rz") + if handle is None: + handle = __bfloat162char_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2494,7 +2540,9 @@ def 
_lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uchar_rz"] + handle = globals().get("__bfloat162uchar_rz") + if handle is None: + handle = __bfloat162uchar_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2538,7 +2586,9 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rn"] + handle = globals().get("__bfloat162int_rn") + if handle is None: + handle = __bfloat162int_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2582,7 +2632,9 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rz"] + handle = globals().get("__bfloat162int_rz") + if handle is None: + handle = __bfloat162int_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2626,7 +2678,9 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_rd"] + handle = globals().get("__bfloat162int_rd") + if handle is None: + handle = __bfloat162int_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2670,7 +2724,9 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162int_ru"] + handle = 
globals().get("__bfloat162int_ru") + if handle is None: + handle = __bfloat162int_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2713,7 +2769,9 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - handle = globals()["__int2bfloat16_rn"] + handle = globals().get("__int2bfloat16_rn") + if handle is None: + handle = __int2bfloat16_rn @lower(handle, int32) def impl(context, builder, sig, args): @@ -2754,7 +2812,9 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - handle = globals()["__int2bfloat16_rz"] + handle = globals().get("__int2bfloat16_rz") + if handle is None: + handle = __int2bfloat16_rz @lower(handle, int32) def impl(context, builder, sig, args): @@ -2795,7 +2855,9 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - handle = globals()["__int2bfloat16_rd"] + handle = globals().get("__int2bfloat16_rd") + if handle is None: + handle = __int2bfloat16_rd @lower(handle, int32) def impl(context, builder, sig, args): @@ -2836,7 +2898,9 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - handle = globals()["__int2bfloat16_ru"] + handle = globals().get("__int2bfloat16_ru") + if handle is None: + handle = __int2bfloat16_ru @lower(handle, int32) def impl(context, builder, sig, args): @@ -2878,7 +2942,9 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rn"] + handle = globals().get("__bfloat162short_rn") + if handle is 
None: + handle = __bfloat162short_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2922,7 +2988,9 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rz"] + handle = globals().get("__bfloat162short_rz") + if handle is None: + handle = __bfloat162short_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -2966,7 +3034,9 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_rd"] + handle = globals().get("__bfloat162short_rd") + if handle is None: + handle = __bfloat162short_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3010,7 +3080,9 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162short_ru"] + handle = globals().get("__bfloat162short_ru") + if handle is None: + handle = __bfloat162short_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3053,7 +3125,9 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - handle = globals()["__short2bfloat16_rn"] + handle = globals().get("__short2bfloat16_rn") + if handle is None: + handle = __short2bfloat16_rn @lower(handle, int16) def impl(context, builder, sig, args): @@ -3096,7 +3170,9 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return 
_ZL19__short2bfloat16_rzs_nbst(arg_0) - handle = globals()["__short2bfloat16_rz"] + handle = globals().get("__short2bfloat16_rz") + if handle is None: + handle = __short2bfloat16_rz @lower(handle, int16) def impl(context, builder, sig, args): @@ -3139,7 +3215,9 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - handle = globals()["__short2bfloat16_rd"] + handle = globals().get("__short2bfloat16_rd") + if handle is None: + handle = __short2bfloat16_rd @lower(handle, int16) def impl(context, builder, sig, args): @@ -3182,7 +3260,9 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - handle = globals()["__short2bfloat16_ru"] + handle = globals().get("__short2bfloat16_ru") + if handle is None: + handle = __short2bfloat16_ru @lower(handle, int16) def impl(context, builder, sig, args): @@ -3226,7 +3306,9 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rn"] + handle = globals().get("__bfloat162uint_rn") + if handle is None: + handle = __bfloat162uint_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3270,7 +3352,9 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rz"] + handle = globals().get("__bfloat162uint_rz") + if handle is None: + handle = __bfloat162uint_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3314,7 +3398,9 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_rd"] + handle = globals().get("__bfloat162uint_rd") + if handle is None: + handle = __bfloat162uint_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3358,7 +3444,9 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162uint_ru"] + handle = globals().get("__bfloat162uint_ru") + if handle is None: + handle = __bfloat162uint_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3401,7 +3489,9 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rn"] + handle = globals().get("__uint2bfloat16_rn") + if handle is None: + handle = __uint2bfloat16_rn @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3444,7 +3534,9 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rz"] + handle = globals().get("__uint2bfloat16_rz") + if handle is None: + handle = __uint2bfloat16_rz @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3487,7 +3579,9 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rdj_nbst(arg_0) - handle = globals()["__uint2bfloat16_rd"] + handle = globals().get("__uint2bfloat16_rd") + if handle is None: + handle = __uint2bfloat16_rd @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3530,7 +3624,9 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, 
shim_obj): def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - handle = globals()["__uint2bfloat16_ru"] + handle = globals().get("__uint2bfloat16_ru") + if handle is None: + handle = __uint2bfloat16_ru @lower(handle, uint32) def impl(context, builder, sig, args): @@ -3574,7 +3670,9 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rn"] + handle = globals().get("__bfloat162ushort_rn") + if handle is None: + handle = __bfloat162ushort_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3618,7 +3716,9 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rz"] + handle = globals().get("__bfloat162ushort_rz") + if handle is None: + handle = __bfloat162ushort_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3662,7 +3762,9 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_rd"] + handle = globals().get("__bfloat162ushort_rd") + if handle is None: + handle = __bfloat162ushort_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3706,7 +3808,9 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ushort_ru"] + handle = globals().get("__bfloat162ushort_ru") + if handle is None: 
+ handle = __bfloat162ushort_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3749,7 +3853,9 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rn"] + handle = globals().get("__ushort2bfloat16_rn") + if handle is None: + handle = __ushort2bfloat16_rn @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3792,7 +3898,9 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rz"] + handle = globals().get("__ushort2bfloat16_rz") + if handle is None: + handle = __ushort2bfloat16_rz @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3835,7 +3943,9 @@ def _lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - handle = globals()["__ushort2bfloat16_rd"] + handle = globals().get("__ushort2bfloat16_rd") + if handle is None: + handle = __ushort2bfloat16_rd @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3878,7 +3988,9 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - handle = globals()["__ushort2bfloat16_ru"] + handle = globals().get("__ushort2bfloat16_ru") + if handle is None: + handle = __ushort2bfloat16_ru @lower(handle, uint16) def impl(context, builder, sig, args): @@ -3922,7 +4034,9 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rn"] + handle = globals().get("__bfloat162ull_rn") + if 
handle is None: + handle = __bfloat162ull_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -3966,7 +4080,9 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rz"] + handle = globals().get("__bfloat162ull_rz") + if handle is None: + handle = __bfloat162ull_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4012,7 +4128,9 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["make_bfloat162"] + handle = globals().get("make_bfloat162") + if handle is None: + handle = make_bfloat162 @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4060,7 +4178,9 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_rd"] + handle = globals().get("__bfloat162ull_rd") + if handle is None: + handle = __bfloat162ull_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4104,7 +4224,9 @@ def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ull_ru"] + handle = globals().get("__bfloat162ull_ru") + if handle is None: + handle = __bfloat162ull_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4147,7 +4269,9 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def 
_ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - handle = globals()["__ull2bfloat16_rn"] + handle = globals().get("__ull2bfloat16_rn") + if handle is None: + handle = __ull2bfloat16_rn @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4188,7 +4312,9 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - handle = globals()["__ull2bfloat16_rz"] + handle = globals().get("__ull2bfloat16_rz") + if handle is None: + handle = __ull2bfloat16_rz @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4229,7 +4355,9 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - handle = globals()["__ull2bfloat16_rd"] + handle = globals().get("__ull2bfloat16_rd") + if handle is None: + handle = __ull2bfloat16_rd @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4270,7 +4398,9 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - handle = globals()["__ull2bfloat16_ru"] + handle = globals().get("__ull2bfloat16_ru") + if handle is None: + handle = __ull2bfloat16_ru @lower(handle, uint64) def impl(context, builder, sig, args): @@ -4312,7 +4442,9 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rn"] + handle = globals().get("__bfloat162ll_rn") + if handle is None: + handle = __bfloat162ll_rn @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4356,7 +4488,9 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rz"] + handle = globals().get("__bfloat162ll_rz") + if handle is None: + handle = __bfloat162ll_rz @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4400,7 +4534,9 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_rd"] + handle = globals().get("__bfloat162ll_rd") + if handle is None: + handle = __bfloat162ll_rd @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4444,7 +4580,9 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162ll_ru"] + handle = globals().get("__bfloat162ll_ru") + if handle is None: + handle = __bfloat162ll_ru @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4487,7 +4625,9 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rn"] + handle = globals().get("__ll2bfloat16_rn") + if handle is None: + handle = __ll2bfloat16_rn @lower(handle, int64) def impl(context, builder, sig, args): @@ -4528,7 +4668,9 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rz"] + handle = globals().get("__ll2bfloat16_rz") + if handle is None: + handle = __ll2bfloat16_rz @lower(handle, int64) def impl(context, builder, sig, args): @@ -4569,7 +4711,9 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): 
def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - handle = globals()["__ll2bfloat16_rd"] + handle = globals().get("__ll2bfloat16_rd") + if handle is None: + handle = __ll2bfloat16_rd @lower(handle, int64) def impl(context, builder, sig, args): @@ -4610,7 +4754,9 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - handle = globals()["__ll2bfloat16_ru"] + handle = globals().get("__ll2bfloat16_ru") + if handle is None: + handle = __ll2bfloat16_ru @lower(handle, int64) def impl(context, builder, sig, args): @@ -4652,7 +4798,9 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - handle = globals()["htrunc"] + handle = globals().get("htrunc") + if handle is None: + handle = htrunc @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4696,7 +4844,9 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - handle = globals()["hceil"] + handle = globals().get("hceil") + if handle is None: + handle = hceil @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4740,7 +4890,9 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - handle = globals()["hfloor"] + handle = globals().get("hfloor") + if handle is None: + handle = hfloor @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4784,7 +4936,9 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrint"] + handle = globals().get("hrint") + if handle is 
None: + handle = hrint @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -4828,7 +4982,9 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2trunc"] + handle = globals().get("h2trunc") + if handle is None: + handle = h2trunc @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4872,7 +5028,9 @@ def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2ceil"] + handle = globals().get("h2ceil") + if handle is None: + handle = h2ceil @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4916,7 +5074,9 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2floor"] + handle = globals().get("h2floor") + if handle is None: + handle = h2floor @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -4960,7 +5120,9 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rint"] + handle = globals().get("h2rint") + if handle is None: + handle = h2rint @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5004,7 +5166,9 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat162bfloat162"] + handle = globals().get("__bfloat162bfloat162") + if handle is None: + handle = __bfloat162bfloat162 @lower(handle, _type___nv_bfloat16) def 
impl(context, builder, sig, args): @@ -5048,7 +5212,9 @@ def _lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - handle = globals()["__lowhigh2highlow"] + handle = globals().get("__lowhigh2highlow") + if handle is None: + handle = __lowhigh2highlow @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5094,7 +5260,9 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__lows2bfloat162"] + handle = globals().get("__lows2bfloat162") + if handle is None: + handle = __lows2bfloat162 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5144,7 +5312,9 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__highs2bfloat162"] + handle = globals().get("__highs2bfloat162") + if handle is None: + handle = __highs2bfloat162 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5192,7 +5362,9 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2bfloat16"] + handle = globals().get("__high2bfloat16") + if handle is None: + handle = __high2bfloat16 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5236,7 +5408,9 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2bfloat16"] + handle = globals().get("__low2bfloat16") + if handle is None: + handle = __low2bfloat16 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5279,7 +5453,9 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hisinf"] + handle = globals().get("__hisinf") + if handle is None: + handle = __hisinf @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5325,7 +5501,9 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__halves2bfloat162"] + handle = globals().get("__halves2bfloat162") + if handle is None: + handle = __halves2bfloat162 @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5373,7 +5551,9 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals()["__low2bfloat162"] + handle = globals().get("__low2bfloat162") + if handle is None: + handle = __low2bfloat162 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -5417,7 +5597,9 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals()["__high2bfloat162"] + handle = globals().get("__high2bfloat162") + if handle is None: + handle = __high2bfloat162 @lower(handle, _type___nv_bfloat162) def 
impl(context, builder, sig, args): @@ -5461,7 +5643,9 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat16_as_short"] + handle = globals().get("__bfloat16_as_short") + if handle is None: + handle = __bfloat16_as_short @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5505,7 +5689,9 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - handle = globals()["__bfloat16_as_ushort"] + handle = globals().get("__bfloat16_as_ushort") + if handle is None: + handle = __bfloat16_as_ushort @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -5548,7 +5734,9 @@ def _lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - handle = globals()["__short_as_bfloat16"] + handle = globals().get("__short_as_bfloat16") + if handle is None: + handle = __short_as_bfloat16 @lower(handle, int16) def impl(context, builder, sig, args): @@ -5591,7 +5779,9 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - handle = globals()["__ushort_as_bfloat16"] + handle = globals().get("__ushort_as_bfloat16") + if handle is None: + handle = __ushort_as_bfloat16 @lower(handle, uint16) def impl(context, builder, sig, args): @@ -5644,7 +5834,9 @@ def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_sync"] + handle = globals().get("__shfl_sync") + if handle is None: + handle = __shfl_sync @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def 
impl(context, builder, sig, args): @@ -5703,7 +5895,9 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_up_sync"] + handle = globals().get("__shfl_up_sync") + if handle is None: + handle = __shfl_up_sync @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): @@ -5762,7 +5956,9 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_down_sync"] + handle = globals().get("__shfl_down_sync") + if handle is None: + handle = __shfl_down_sync @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): @@ -5821,7 +6017,9 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_xor_sync"] + handle = globals().get("__shfl_xor_sync") + if handle is None: + handle = __shfl_xor_sync @lower(handle, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): @@ -5876,7 +6074,9 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_sync"] + handle = globals().get("__shfl_sync") + if handle is None: + handle = __shfl_sync @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): @@ -5931,7 +6131,9 @@ def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_up_sync"] + handle = globals().get("__shfl_up_sync") + if handle is None: + handle = __shfl_up_sync @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): @@ -5986,7 +6188,9 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_down_sync"] + handle = globals().get("__shfl_down_sync") + if handle is None: + handle = __shfl_down_sync @lower(handle, uint32, _type___nv_bfloat16, uint32, 
int32) def impl(context, builder, sig, args): @@ -6041,7 +6245,9 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals()["__shfl_xor_sync"] + handle = globals().get("__shfl_xor_sync") + if handle is None: + handle = __shfl_xor_sync @lower(handle, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): @@ -6091,7 +6297,9 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldg"] + handle = globals().get("__ldg") + if handle is None: + handle = __ldg @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6133,7 +6341,9 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldg"] + handle = globals().get("__ldg") + if handle is None: + handle = __ldg @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6179,7 +6389,9 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcg"] + handle = globals().get("__ldcg") + if handle is None: + handle = __ldcg @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6221,7 +6433,9 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcg"] + handle = globals().get("__ldcg") + if handle is None: + handle = __ldcg @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6267,7 +6481,9 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldca"] + handle = globals().get("__ldca") + if handle is None: + handle = __ldca @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6309,7 +6525,9 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldca"] + handle = globals().get("__ldca") + if handle is None: + handle = __ldca @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6355,7 +6573,9 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcs"] + handle = globals().get("__ldcs") + if handle is None: + handle = __ldcs @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6397,7 +6617,9 @@ def _lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcs"] + handle = globals().get("__ldcs") + if handle is None: + handle = __ldcs @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6443,7 +6665,9 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldlu"] + handle = globals().get("__ldlu") + if handle is None: + handle = __ldlu @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6485,7 +6709,9 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldlu"] + handle = globals().get("__ldlu") + if handle is None: + handle = __ldlu @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6531,7 +6757,9 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - handle = globals()["__ldcv"] + handle = globals().get("__ldcv") + if handle is None: + handle = __ldcv @lower(handle, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): @@ -6573,7 +6801,9 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - handle = globals()["__ldcv"] + handle = globals().get("__ldcv") + if handle is None: + handle = __ldcv @lower(handle, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): @@ -6622,7 +6852,9 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stwb"] + handle = globals().get("__stwb") + if handle is None: + handle = __stwb @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6669,7 +6901,9 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stwb"] + handle = globals().get("__stwb") + if handle is None: + handle = __stwb @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6720,7 +6954,9 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stcg"] + handle = globals().get("__stcg") + if handle is None: + handle = __stcg @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6767,7 +7003,9 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stcg"] + handle = globals().get("__stcg") + if handle is None: + handle = __stcg @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6818,7 +7056,9 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stcs"] + handle = globals().get("__stcs") + if handle is None: + handle = __stcs @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6865,7 +7105,9 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stcs"] + handle = globals().get("__stcs") + if handle is None: + handle = __stcs @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -6916,7 +7158,9 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__stwt"] + handle = globals().get("__stwt") + if handle is None: + handle = __stwt @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -6963,7 +7207,9 @@ def 
_lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__stwt"] + handle = globals().get("__stwt") + if handle is None: + handle = __stwt @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -7013,7 +7259,9 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__heq2"] + handle = globals().get("__heq2") + if handle is None: + handle = __heq2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7063,7 +7311,9 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hne2"] + handle = globals().get("__hne2") + if handle is None: + handle = __hne2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7113,7 +7363,9 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hle2"] + handle = globals().get("__hle2") + if handle is None: + handle = __hle2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7163,7 +7415,9 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hge2"] + handle = globals().get("__hge2") + if handle is None: + handle = __hge2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def 
impl(context, builder, sig, args): @@ -7213,7 +7467,9 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hlt2"] + handle = globals().get("__hlt2") + if handle is None: + handle = __hlt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7263,7 +7519,9 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgt2"] + handle = globals().get("__hgt2") + if handle is None: + handle = __hgt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7313,7 +7571,9 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hequ2"] + handle = globals().get("__hequ2") + if handle is None: + handle = __hequ2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7363,7 +7623,9 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hneu2"] + handle = globals().get("__hneu2") + if handle is None: + handle = __hneu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7413,7 +7675,9 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hleu2"] + handle = globals().get("__hleu2") + if handle is None: + handle = __hleu2 
@lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7463,7 +7727,9 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgeu2"] + handle = globals().get("__hgeu2") + if handle is None: + handle = __hgeu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7513,7 +7779,9 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hltu2"] + handle = globals().get("__hltu2") + if handle is None: + handle = __hltu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7563,7 +7831,9 @@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgtu2"] + handle = globals().get("__hgtu2") + if handle is None: + handle = __hgtu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7611,7 +7881,9 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__heq2_mask"] + handle = globals().get("__heq2_mask") + if handle is None: + handle = __heq2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7659,7 +7931,9 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hne2_mask"] + handle = globals().get("__hne2_mask") + if handle is None: + handle = __hne2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7707,7 +7981,9 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hle2_mask"] + handle = globals().get("__hle2_mask") + if handle is None: + handle = __hle2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7755,7 +8031,9 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hge2_mask"] + handle = globals().get("__hge2_mask") + if handle is None: + handle = __hge2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7803,7 +8081,9 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hlt2_mask"] + handle = globals().get("__hlt2_mask") + if handle is None: + handle = __hlt2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7851,7 +8131,9 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgt2_mask"] + handle = globals().get("__hgt2_mask") + if handle is None: + handle = __hgt2_mask @lower(handle, _type___nv_bfloat162, 
_type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7899,7 +8181,9 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hequ2_mask"] + handle = globals().get("__hequ2_mask") + if handle is None: + handle = __hequ2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7947,7 +8231,9 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hneu2_mask"] + handle = globals().get("__hneu2_mask") + if handle is None: + handle = __hneu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -7995,7 +8281,9 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hleu2_mask"] + handle = globals().get("__hleu2_mask") + if handle is None: + handle = __hleu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8043,7 +8331,9 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgeu2_mask"] + handle = globals().get("__hgeu2_mask") + if handle is None: + handle = __hgeu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8091,7 +8381,9 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hltu2_mask"] + handle = globals().get("__hltu2_mask") + if handle is None: + handle = __hltu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8139,7 +8431,9 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hgtu2_mask"] + handle = globals().get("__hgtu2_mask") + if handle is None: + handle = __hgtu2_mask @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8187,7 +8481,9 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - handle = globals()["__hisnan2"] + handle = globals().get("__hisnan2") + if handle is None: + handle = __hisnan2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8233,7 +8529,9 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2"] + handle = globals().get("__hadd2") + if handle is None: + handle = __hadd2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8283,7 +8581,9 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2"] + handle = globals().get("__hsub2") + if handle is None: + handle = __hsub2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, 
sig, args): @@ -8333,7 +8633,9 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2"] + handle = globals().get("__hmul2") + if handle is None: + handle = __hmul2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8383,7 +8685,9 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2_rn"] + handle = globals().get("__hadd2_rn") + if handle is None: + handle = __hadd2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8433,7 +8737,9 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2_rn"] + handle = globals().get("__hsub2_rn") + if handle is None: + handle = __hsub2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8483,7 +8789,9 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2_rn"] + handle = globals().get("__hmul2_rn") + if handle is None: + handle = __hmul2_rn @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8533,7 +8841,9 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__h2div"] + handle = 
globals().get("__h2div") + if handle is None: + handle = __h2div @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8581,7 +8891,9 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - handle = globals()["__habs2"] + handle = globals().get("__habs2") + if handle is None: + handle = __habs2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8627,7 +8939,9 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hadd2_sat"] + handle = globals().get("__hadd2_sat") + if handle is None: + handle = __hadd2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8677,7 +8991,9 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hsub2_sat"] + handle = globals().get("__hsub2_sat") + if handle is None: + handle = __hsub2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8727,7 +9043,9 @@ def _lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmul2_sat"] + handle = globals().get("__hmul2_sat") + if handle is None: + handle = __hmul2_sat @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8779,7 +9097,9 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def 
_ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2"] + handle = globals().get("__hfma2") + if handle is None: + handle = __hfma2 @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -8834,7 +9154,9 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2_sat"] + handle = globals().get("__hfma2_sat") + if handle is None: + handle = __hfma2_sat @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -8885,7 +9207,9 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - handle = globals()["__hneg2"] + handle = globals().get("__hneg2") + if handle is None: + handle = __hneg2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -8929,7 +9253,9 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - handle = globals()["__habs"] + handle = globals().get("__habs") + if handle is None: + handle = __habs @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -8975,7 +9301,9 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd"] + handle = globals().get("__hadd") + if handle is None: + handle = __hadd @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9025,7 +9353,9 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def 
_ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub"] + handle = globals().get("__hsub") + if handle is None: + handle = __hsub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9075,7 +9405,9 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul"] + handle = globals().get("__hmul") + if handle is None: + handle = __hmul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9125,7 +9457,9 @@ def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd_rn"] + handle = globals().get("__hadd_rn") + if handle is None: + handle = __hadd_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9175,7 +9509,9 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub_rn"] + handle = globals().get("__hsub_rn") + if handle is None: + handle = __hsub_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9225,7 +9561,9 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul_rn"] + handle = globals().get("__hmul_rn") + if handle is None: + handle = __hmul_rn @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9275,7 +9613,9 
@@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hdiv"] + handle = globals().get("__hdiv") + if handle is None: + handle = __hdiv @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9325,7 +9665,9 @@ def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hadd_sat"] + handle = globals().get("__hadd_sat") + if handle is None: + handle = __hadd_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9375,7 +9717,9 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hsub_sat"] + handle = globals().get("__hsub_sat") + if handle is None: + handle = __hsub_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9425,7 +9769,9 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmul_sat"] + handle = globals().get("__hmul_sat") + if handle is None: + handle = __hmul_sat @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9477,7 +9823,9 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma"] + handle = globals().get("__hfma") + if handle is None: + handle = 
__hfma @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -9532,7 +9880,9 @@ def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma_sat"] + handle = globals().get("__hfma_sat") + if handle is None: + handle = __hfma_sat @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -9583,7 +9933,9 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hneg"] + handle = globals().get("__hneg") + if handle is None: + handle = __hneg @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -9627,7 +9979,9 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbeq2"] + handle = globals().get("__hbeq2") + if handle is None: + handle = __hbeq2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9675,7 +10029,9 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbne2"] + handle = globals().get("__hbne2") + if handle is None: + handle = __hbne2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9723,7 +10079,9 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hble2"] + handle = globals().get("__hble2") + if 
handle is None: + handle = __hble2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9771,7 +10129,9 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbge2"] + handle = globals().get("__hbge2") + if handle is None: + handle = __hbge2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9819,7 +10179,9 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hblt2"] + handle = globals().get("__hblt2") + if handle is None: + handle = __hblt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9867,7 +10229,9 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgt2"] + handle = globals().get("__hbgt2") + if handle is None: + handle = __hbgt2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9915,7 +10279,9 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbequ2"] + handle = globals().get("__hbequ2") + if handle is None: + handle = __hbequ2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -9963,7 +10329,9 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbneu2"] + handle = globals().get("__hbneu2") + if handle is None: + handle = __hbneu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10011,7 +10379,9 @@ def _lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbleu2"] + handle = globals().get("__hbleu2") + if handle is None: + handle = __hbleu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10059,7 +10429,9 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgeu2"] + handle = globals().get("__hbgeu2") + if handle is None: + handle = __hbgeu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10107,7 +10479,9 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbltu2"] + handle = globals().get("__hbltu2") + if handle is None: + handle = __hbltu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10155,7 +10529,9 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hbgtu2"] + handle = globals().get("__hbgtu2") + if handle is None: + handle = __hbgtu2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -10203,7 +10579,9 @@ def 
_lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__heq"] + handle = globals().get("__heq") + if handle is None: + handle = __heq @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10251,7 +10629,9 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hne"] + handle = globals().get("__hne") + if handle is None: + handle = __hne @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10299,7 +10679,9 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hle"] + handle = globals().get("__hle") + if handle is None: + handle = __hle @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10347,7 +10729,9 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hge"] + handle = globals().get("__hge") + if handle is None: + handle = __hge @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10395,7 +10779,9 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hlt"] + handle = globals().get("__hlt") + if handle is None: + handle = __hlt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10443,7 +10829,9 @@ def 
_lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgt"] + handle = globals().get("__hgt") + if handle is None: + handle = __hgt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10491,7 +10879,9 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hequ"] + handle = globals().get("__hequ") + if handle is None: + handle = __hequ @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10539,7 +10929,9 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hneu"] + handle = globals().get("__hneu") + if handle is None: + handle = __hneu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10587,7 +10979,9 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hleu"] + handle = globals().get("__hleu") + if handle is None: + handle = __hleu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10635,7 +11029,9 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgeu"] + handle = globals().get("__hgeu") + if handle is None: + handle = __hgeu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ 
-10683,7 +11079,9 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hltu"] + handle = globals().get("__hltu") + if handle is None: + handle = __hltu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10731,7 +11129,9 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hgtu"] + handle = globals().get("__hgtu") + if handle is None: + handle = __hgtu @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10778,7 +11178,9 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - handle = globals()["__hisnan"] + handle = globals().get("__hisnan") + if handle is None: + handle = __hisnan @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10824,7 +11226,9 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmax"] + handle = globals().get("__hmax") + if handle is None: + handle = __hmax @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10874,7 +11278,9 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmin"] + handle = globals().get("__hmin") + if handle is None: + handle = __hmin @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): 
@@ -10924,7 +11330,9 @@ def _lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmax_nan"] + handle = globals().get("__hmax_nan") + if handle is None: + handle = __hmax_nan @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -10974,7 +11382,9 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["__hmin_nan"] + handle = globals().get("__hmin_nan") + if handle is None: + handle = __hmin_nan @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11026,7 +11436,9 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma_relu"] + handle = globals().get("__hfma_relu") + if handle is None: + handle = __hfma_relu @lower( handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 @@ -11079,7 +11491,9 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmax2"] + handle = globals().get("__hmax2") + if handle is None: + handle = __hmax2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11129,7 +11543,9 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmin2"] + handle = 
globals().get("__hmin2") + if handle is None: + handle = __hmin2 @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11179,7 +11595,9 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmax2_nan"] + handle = globals().get("__hmax2_nan") + if handle is None: + handle = __hmax2_nan @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11229,7 +11647,9 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["__hmin2_nan"] + handle = globals().get("__hmin2_nan") + if handle is None: + handle = __hmin2_nan @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11281,7 +11701,9 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hfma2_relu"] + handle = globals().get("__hfma2_relu") + if handle is None: + handle = __hfma2_relu @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -11336,7 +11758,9 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals()["__hcmadd"] + handle = globals().get("__hcmadd") + if handle is None: + handle = __hcmadd @lower( handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 @@ -11387,7 +11811,9 @@ def 
_lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals()["hsqrt"] + handle = globals().get("hsqrt") + if handle is None: + handle = hsqrt @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11431,7 +11857,9 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrsqrt"] + handle = globals().get("hrsqrt") + if handle is None: + handle = hrsqrt @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11475,7 +11903,9 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - handle = globals()["hrcp"] + handle = globals().get("hrcp") + if handle is None: + handle = hrcp @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11517,7 +11947,9 @@ def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog"] + handle = globals().get("hlog") + if handle is None: + handle = hlog @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11559,7 +11991,9 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog2"] + handle = globals().get("hlog2") + if handle is None: + handle = hlog2 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11603,7 +12037,9 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - handle = globals()["hlog10"] + handle = 
globals().get("hlog10") + if handle is None: + handle = hlog10 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11647,7 +12083,9 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp"] + handle = globals().get("hexp") + if handle is None: + handle = hexp @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11689,7 +12127,9 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - handle = globals()["htanh_approx"] + handle = globals().get("htanh_approx") + if handle is None: + handle = htanh_approx @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11733,7 +12173,9 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2tanh_approx"] + handle = globals().get("h2tanh_approx") + if handle is None: + handle = h2tanh_approx @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -11777,7 +12219,9 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - handle = globals()["htanh"] + handle = globals().get("htanh") + if handle is None: + handle = htanh @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11821,7 +12265,9 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2tanh"] + handle = globals().get("h2tanh") + if handle is None: + handle = h2tanh @lower(handle, _type___nv_bfloat162) 
def impl(context, builder, sig, args): @@ -11865,7 +12311,9 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp2"] + handle = globals().get("hexp2") + if handle is None: + handle = hexp2 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11909,7 +12357,9 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - handle = globals()["hexp10"] + handle = globals().get("hexp10") + if handle is None: + handle = hexp10 @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11953,7 +12403,9 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - handle = globals()["hcos"] + handle = globals().get("hcos") + if handle is None: + handle = hcos @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -11995,7 +12447,9 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - handle = globals()["hsin"] + handle = globals().get("hsin") + if handle is None: + handle = hsin @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12037,7 +12491,9 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2sqrt"] + handle = globals().get("h2sqrt") + if handle is None: + handle = h2sqrt @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12081,7 +12537,9 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return 
_ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rsqrt"] + handle = globals().get("h2rsqrt") + if handle is None: + handle = h2rsqrt @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12125,7 +12583,9 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2rcp"] + handle = globals().get("h2rcp") + if handle is None: + handle = h2rcp @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12169,7 +12629,9 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log"] + handle = globals().get("h2log") + if handle is None: + handle = h2log @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12213,7 +12675,9 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log2"] + handle = globals().get("h2log2") + if handle is None: + handle = h2log2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12257,7 +12721,9 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - handle = globals()["h2log10"] + handle = globals().get("h2log10") + if handle is None: + handle = h2log10 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12301,7 +12767,9 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp"] + handle = globals().get("h2exp") + if handle is None: + handle = h2exp @lower(handle, 
_type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12345,7 +12813,9 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp2"] + handle = globals().get("h2exp2") + if handle is None: + handle = h2exp2 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12389,7 +12859,9 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - handle = globals()["h2exp10"] + handle = globals().get("h2exp10") + if handle is None: + handle = h2exp10 @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12433,7 +12905,9 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2cos"] + handle = globals().get("h2cos") + if handle is None: + handle = h2cos @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12477,7 +12951,9 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - handle = globals()["h2sin"] + handle = globals().get("h2sin") + if handle is None: + handle = h2sin @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12524,7 +13000,9 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals()["atomicAdd"] + handle = globals().get("atomicAdd") + if handle is None: + handle = atomicAdd @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -12571,7 +13049,9 @@ 
def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals()["atomicAdd"] + handle = globals().get("atomicAdd") + if handle is None: + handle = atomicAdd @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12617,7 +13097,9 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.add"] + handle = globals().get("operator.add") + if handle is None: + handle = operator.add @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12663,7 +13145,9 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.sub"] + handle = globals().get("operator.sub") + if handle is None: + handle = operator.sub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12709,7 +13193,9 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.mul"] + handle = globals().get("operator.mul") + if handle is None: + handle = operator.mul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12755,7 +13241,9 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.truediv"] + handle = globals().get("operator.truediv") + if handle is None: + handle = operator.truediv 
@lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12801,7 +13289,9 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.iadd"] + handle = globals().get("operator.iadd") + if handle is None: + handle = operator.iadd @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12847,7 +13337,9 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.isub"] + handle = globals().get("operator.isub") + if handle is None: + handle = operator.isub @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12893,7 +13385,9 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.imul"] + handle = globals().get("operator.imul") + if handle is None: + handle = operator.imul @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12939,7 +13433,9 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals()["operator.itruediv"] + handle = globals().get("operator.itruediv") + if handle is None: + handle = operator.itruediv @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -12983,7 +13479,9 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - handle = globals()["operator.pos"] + handle 
= globals().get("operator.pos") + if handle is None: + handle = operator.pos @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13021,7 +13519,9 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - handle = globals()["operator.neg"] + handle = globals().get("operator.neg") + if handle is None: + handle = operator.neg @lower(handle, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13059,7 +13559,9 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.eq"] + handle = globals().get("operator.eq") + if handle is None: + handle = operator.eq @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13103,7 +13605,9 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.ne"] + handle = globals().get("operator.ne") + if handle is None: + handle = operator.ne @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13147,7 +13651,9 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.gt"] + handle = globals().get("operator.gt") + if handle is None: + handle = operator.gt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13191,7 +13697,9 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.lt"] + handle 
= globals().get("operator.lt") + if handle is None: + handle = operator.lt @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13235,7 +13743,9 @@ def _lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.ge"] + handle = globals().get("operator.ge") + if handle is None: + handle = operator.ge @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13279,7 +13789,9 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals()["operator.le"] + handle = globals().get("operator.le") + if handle is None: + handle = operator.le @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): @@ -13325,7 +13837,9 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.add"] + handle = globals().get("operator.add") + if handle is None: + handle = operator.add @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13371,7 +13885,9 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.sub"] + handle = globals().get("operator.sub") + if handle is None: + handle = operator.sub @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13417,7 +13933,9 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return 
_ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.mul"] + handle = globals().get("operator.mul") + if handle is None: + handle = operator.mul @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13463,7 +13981,9 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.truediv"] + handle = globals().get("operator.truediv") + if handle is None: + handle = operator.truediv @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13509,7 +14029,9 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.iadd"] + handle = globals().get("operator.iadd") + if handle is None: + handle = operator.iadd @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13555,7 +14077,9 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.isub"] + handle = globals().get("operator.isub") + if handle is None: + handle = operator.isub @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13601,7 +14125,9 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.imul"] + handle = globals().get("operator.imul") + if handle is None: + handle = operator.imul @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13647,7 +14173,9 @@ def 
_lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals()["operator.itruediv"] + handle = globals().get("operator.itruediv") + if handle is None: + handle = operator.itruediv @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13691,7 +14219,9 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - handle = globals()["operator.pos"] + handle = globals().get("operator.pos") + if handle is None: + handle = operator.pos @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13729,7 +14259,9 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - handle = globals()["operator.neg"] + handle = globals().get("operator.neg") + if handle is None: + handle = operator.neg @lower(handle, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13767,7 +14299,9 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.eq"] + handle = globals().get("operator.eq") + if handle is None: + handle = operator.eq @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13811,7 +14345,9 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.ne"] + handle = globals().get("operator.ne") + if handle is None: + handle = operator.ne @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ 
-13855,7 +14391,9 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.gt"] + handle = globals().get("operator.gt") + if handle is None: + handle = operator.gt @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13899,7 +14437,9 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.lt"] + handle = globals().get("operator.lt") + if handle is None: + handle = operator.lt @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13943,7 +14483,9 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.ge"] + handle = globals().get("operator.ge") + if handle is None: + handle = operator.ge @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -13987,7 +14529,9 @@ def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals()["operator.le"] + handle = globals().get("operator.le") + if handle is None: + handle = operator.le @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): @@ -14034,7 +14578,9 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - handle = globals()["__half"] + handle = globals().get("__half") + if handle is None: + handle = __half @lower(handle, _type___nv_bfloat16) 
def impl(context, builder, sig, args): From 9e79d370a435e7b354423a9df068338606d22918 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 19/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index fb4e4de21..e27df978d 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -27,7 +27,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,8 +48,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -180,37 +181,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = 
ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -356,6 +327,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. + @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 667d9fae07a1a20e4f4f5f26f25f5ead70fc6385 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:32:50 -0700 Subject: [PATCH 20/56] generate bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 157 +++++++++++++++++-- 1 file changed, 142 insertions(+), 15 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index e27df978d..5adb821eb 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1,6 +1,6 @@ # Automatically generated by Numbast Static Binding Generator # Generator Information: -# Ast_canopy version: 0.4.0 +# Ast_canopy version: 0.5.0 # Numbast version: 0.5.0 # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal/ # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/_internal/', 'run_ruff_format': True} @@ -23,10 +23,12 @@ register_model, ) from numba.core.imputils import Registry as TargetRegistry +from numba.core.imputils import lower_cast from numba.core.typing import signature from numba.core.typing.templates import AttributeTemplate, 
ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -48,10 +50,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -181,7 +181,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -281,6 +311,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(_type_unnamed1405307, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1ERK17__nv_bfloat16_raw(shim_stream, shim_obj) @@ -327,17 +366,6 @@ def ctor_impl(context, 
builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. - @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -382,6 +410,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ef(shim_stream, shim_obj) @@ -426,6 +463,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(float64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ed(shim_stream, shim_obj) @@ -470,6 +516,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Es(shim_stream, shim_obj) @@ -514,6 +569,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint16, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + 
_lower__ZN13__nv_bfloat16C1Et(shim_stream, shim_obj) @@ -558,6 +622,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ei(shim_stream, shim_obj) @@ -602,6 +675,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint32, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ej(shim_stream, shim_obj) @@ -646,6 +728,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1El(shim_stream, shim_obj) @@ -690,6 +781,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Em(shim_stream, shim_obj) @@ -734,6 +834,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(int64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ex(shim_stream, shim_obj) @@ -778,6 
+887,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + @lower_cast(uint64, _type___nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1Ey(shim_stream, shim_obj) @@ -1792,6 +1910,15 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat162, "alignof_", None) ) + @lower_cast(_type_unnamed1405416, _type___nv_bfloat162) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(_type___nv_bfloat162, fromty), + [value], + ) + _lower__ZN14__nv_bfloat162C1ERK18__nv_bfloat162_raw(shim_stream, shim_obj) From c4cf6858b5b6c073ac58375ff3c33652c1adf94c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 21/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 5adb821eb..21babf0c9 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,7 +28,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -50,8 +49,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -181,37 +182,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, 
"y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class _model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -366,6 +337,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. 
+ @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From 7a89d3ebfd620491da36323dc5f956e87364d411 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:38:13 -0700 Subject: [PATCH 22/56] generate the bindings --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 1648 ++++-------------- 1 file changed, 314 insertions(+), 1334 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 21babf0c9..0eef75e12 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,6 +28,7 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device +from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -49,10 +50,8 @@ uint64, void, ) -from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] -__half = float16 typing_registry = TypingRegistry() @@ -182,7 +181,37 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 +@register +class _ctor_template_unnamed1405416(ConcreteTemplate): + key = globals()["unnamed1405416"] + cases = [] + + +register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) + + +# Typing for __nv_bfloat16 +class _type_class___nv_bfloat16(Number): + def __init__(self): + super().__init__(name="__nv_bfloat16") + self.alignof_ = 2 + self.bitwidth = 2 * 8 + + +_type___nv_bfloat16 = _type_class___nv_bfloat16() + + +# Make Python API for struct +__nv_bfloat16 = 
type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) + +as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) + + +@register_model(_type_class___nv_bfloat16) +class _model___nv_bfloat16(PrimitiveModel): + def __init__(self, dmm, fe_type): + be_type = ir.IntType(fe_type.bitwidth) + super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -337,17 +366,6 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) - # By default, Numbast does not generate this cast because the c++ conversion - # constructor is marked explict. We enable it by hand here. - @lower_cast(float16, __nv_bfloat16) - def conversion_impl(context, builder, fromty, toty, value): - return ctor_impl( - context, - builder, - signature(__nv_bfloat16, fromty), - [value], - ) - _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) @@ -1994,11 +2012,7 @@ def _lower__ZL17__double2bfloat16d_nbst(shim_stream, shim_obj): def _ZL17__double2bfloat16d_nbst_caller(arg_0): return _ZL17__double2bfloat16d_nbst(arg_0) - handle = globals().get("__double2bfloat16") - if handle is None: - handle = __double2bfloat16 - - @lower(handle, float64) + @lower(__double2bfloat16, float64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__double2bfloat16d_nbst", shim_raw_str) @@ -2037,11 +2051,7 @@ def _lower__ZL16__float2bfloat16f_nbst(shim_stream, shim_obj): def _ZL16__float2bfloat16f_nbst_caller(arg_0): return _ZL16__float2bfloat16f_nbst(arg_0) - handle = globals().get("__float2bfloat16") - if handle is None: - handle = __float2bfloat16 - - @lower(handle, float32) + @lower(__float2bfloat16, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__float2bfloat16f_nbst", shim_raw_str) @@ -2080,11 +2090,7 @@ def 
_lower__ZL19__float2bfloat16_rnf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rnf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rnf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rn") - if handle is None: - handle = __float2bfloat16_rn - - @lower(handle, float32) + @lower(__float2bfloat16_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2125,11 +2131,7 @@ def _lower__ZL19__float2bfloat16_rzf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rzf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rzf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rz") - if handle is None: - handle = __float2bfloat16_rz - - @lower(handle, float32) + @lower(__float2bfloat16_rz, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2170,11 +2172,7 @@ def _lower__ZL19__float2bfloat16_rdf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_rdf_nbst_caller(arg_0): return _ZL19__float2bfloat16_rdf_nbst(arg_0) - handle = globals().get("__float2bfloat16_rd") - if handle is None: - handle = __float2bfloat16_rd - - @lower(handle, float32) + @lower(__float2bfloat16_rd, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2215,11 +2213,7 @@ def _lower__ZL19__float2bfloat16_ruf_nbst(shim_stream, shim_obj): def _ZL19__float2bfloat16_ruf_nbst_caller(arg_0): return _ZL19__float2bfloat16_ruf_nbst(arg_0) - handle = globals().get("__float2bfloat16_ru") - if handle is None: - handle = __float2bfloat16_ru - - @lower(handle, float32) + @lower(__float2bfloat16_ru, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2261,11 +2255,7 @@ def _lower__ZL16__bfloat162float13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL16__bfloat162float13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162float13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162float") - if handle is None: - handle = __bfloat162float - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162float, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2307,11 +2297,7 @@ def _lower__ZL20__float2bfloat162_rnf_nbst(shim_stream, shim_obj): def _ZL20__float2bfloat162_rnf_nbst_caller(arg_0): return _ZL20__float2bfloat162_rnf_nbst(arg_0) - handle = globals().get("__float2bfloat162_rn") - if handle is None: - handle = __float2bfloat162_rn - - @lower(handle, float32) + @lower(__float2bfloat162_rn, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2353,11 +2339,7 @@ def _lower__ZL21__floats2bfloat162_rnff_nbst(shim_stream, shim_obj): def _ZL21__floats2bfloat162_rnff_nbst_caller(arg_0, arg_1): return _ZL21__floats2bfloat162_rnff_nbst(arg_0, arg_1) - handle = globals().get("__floats2bfloat162_rn") - if handle is None: - handle = __floats2bfloat162_rn - - @lower(handle, float32, float32) + @lower(__floats2bfloat162_rn, float32, float32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2401,11 +2383,7 @@ def _lower__ZL11__low2float14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL11__low2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL11__low2float14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2float") - if handle is None: - handle = __low2float - - @lower(handle, _type___nv_bfloat162) + @lower(__low2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2447,11 +2425,7 @@ def _lower__ZL12__high2float14__nv_bfloat162_nbst(shim_stream, 
shim_obj): def _ZL12__high2float14__nv_bfloat162_nbst_caller(arg_0): return _ZL12__high2float14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2float") - if handle is None: - handle = __high2float - - @lower(handle, _type___nv_bfloat162) + @lower(__high2float, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2493,11 +2467,7 @@ def _lower__ZL21__float22bfloat162_rn6float2_nbst(shim_stream, shim_obj): def _ZL21__float22bfloat162_rn6float2_nbst_caller(arg_0): return _ZL21__float22bfloat162_rn6float2_nbst(arg_0) - handle = globals().get("__float22bfloat162_rn") - if handle is None: - handle = __float22bfloat162_rn - - @lower(handle, float32x2) + @lower(__float22bfloat162_rn, float32x2) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2539,11 +2509,7 @@ def _lower__ZL18__bfloat1622float214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL18__bfloat1622float214__nv_bfloat162_nbst_caller(arg_0): return _ZL18__bfloat1622float214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__bfloat1622float2") - if handle is None: - handle = __bfloat1622float2 - - @lower(handle, _type___nv_bfloat162) + @lower(__bfloat1622float2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2585,11 +2551,7 @@ def _lower__ZL18__bfloat162char_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162char_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162char_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162char_rz") - if handle is None: - handle = __bfloat162char_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162char_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-2631,11 +2593,7 @@ def _lower__ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162uchar_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uchar_rz") - if handle is None: - handle = __bfloat162uchar_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uchar_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2677,11 +2635,7 @@ def _lower__ZL17__bfloat162int_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rn") - if handle is None: - handle = __bfloat162int_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2723,11 +2677,7 @@ def _lower__ZL17__bfloat162int_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rz") - if handle is None: - handle = __bfloat162int_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2769,11 +2719,7 @@ def _lower__ZL17__bfloat162int_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_rd") - if handle is None: - handle = __bfloat162int_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_rd, _type___nv_bfloat16) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2815,11 +2761,7 @@ def _lower__ZL17__bfloat162int_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162int_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162int_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162int_ru") - if handle is None: - handle = __bfloat162int_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162int_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -2860,11 +2802,7 @@ def _lower__ZL17__int2bfloat16_rni_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rni_nbst_caller(arg_0): return _ZL17__int2bfloat16_rni_nbst(arg_0) - handle = globals().get("__int2bfloat16_rn") - if handle is None: - handle = __int2bfloat16_rn - - @lower(handle, int32) + @lower(__int2bfloat16_rn, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rni_nbst", shim_raw_str) @@ -2903,11 +2841,7 @@ def _lower__ZL17__int2bfloat16_rzi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rzi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rzi_nbst(arg_0) - handle = globals().get("__int2bfloat16_rz") - if handle is None: - handle = __int2bfloat16_rz - - @lower(handle, int32) + @lower(__int2bfloat16_rz, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rzi_nbst", shim_raw_str) @@ -2946,11 +2880,7 @@ def _lower__ZL17__int2bfloat16_rdi_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rdi_nbst_caller(arg_0): return _ZL17__int2bfloat16_rdi_nbst(arg_0) - handle = globals().get("__int2bfloat16_rd") - if handle is None: - handle = __int2bfloat16_rd - - @lower(handle, int32) + @lower(__int2bfloat16_rd, int32) def 
impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rdi_nbst", shim_raw_str) @@ -2989,11 +2919,7 @@ def _lower__ZL17__int2bfloat16_rui_nbst(shim_stream, shim_obj): def _ZL17__int2bfloat16_rui_nbst_caller(arg_0): return _ZL17__int2bfloat16_rui_nbst(arg_0) - handle = globals().get("__int2bfloat16_ru") - if handle is None: - handle = __int2bfloat16_ru - - @lower(handle, int32) + @lower(__int2bfloat16_ru, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__int2bfloat16_rui_nbst", shim_raw_str) @@ -3033,11 +2959,7 @@ def _lower__ZL19__bfloat162short_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_rn") - if handle is None: - handle = __bfloat162short_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3079,11 +3001,7 @@ def _lower__ZL19__bfloat162short_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_rz") - if handle is None: - handle = __bfloat162short_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3125,11 +3043,7 @@ def _lower__ZL19__bfloat162short_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_rd13__nv_bfloat16_nbst(arg_0) - handle = 
globals().get("__bfloat162short_rd") - if handle is None: - handle = __bfloat162short_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3171,11 +3085,7 @@ def _lower__ZL19__bfloat162short_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat162short_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat162short_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162short_ru") - if handle is None: - handle = __bfloat162short_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162short_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3216,11 +3126,7 @@ def _lower__ZL19__short2bfloat16_rns_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rns_nbst_caller(arg_0): return _ZL19__short2bfloat16_rns_nbst(arg_0) - handle = globals().get("__short2bfloat16_rn") - if handle is None: - handle = __short2bfloat16_rn - - @lower(handle, int16) + @lower(__short2bfloat16_rn, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3261,11 +3167,7 @@ def _lower__ZL19__short2bfloat16_rzs_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rzs_nbst_caller(arg_0): return _ZL19__short2bfloat16_rzs_nbst(arg_0) - handle = globals().get("__short2bfloat16_rz") - if handle is None: - handle = __short2bfloat16_rz - - @lower(handle, int16) + @lower(__short2bfloat16_rz, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3306,11 +3208,7 @@ def _lower__ZL19__short2bfloat16_rds_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rds_nbst_caller(arg_0): return _ZL19__short2bfloat16_rds_nbst(arg_0) - handle = 
globals().get("__short2bfloat16_rd") - if handle is None: - handle = __short2bfloat16_rd - - @lower(handle, int16) + @lower(__short2bfloat16_rd, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3351,11 +3249,7 @@ def _lower__ZL19__short2bfloat16_rus_nbst(shim_stream, shim_obj): def _ZL19__short2bfloat16_rus_nbst_caller(arg_0): return _ZL19__short2bfloat16_rus_nbst(arg_0) - handle = globals().get("__short2bfloat16_ru") - if handle is None: - handle = __short2bfloat16_ru - - @lower(handle, int16) + @lower(__short2bfloat16_ru, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3397,11 +3291,7 @@ def _lower__ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rn") - if handle is None: - handle = __bfloat162uint_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3443,11 +3333,7 @@ def _lower__ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rz") - if handle is None: - handle = __bfloat162uint_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3489,11 +3375,7 @@ def _lower__ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_rd13__nv_bfloat16_nbst_caller(arg_0): return 
_ZL18__bfloat162uint_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_rd") - if handle is None: - handle = __bfloat162uint_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3535,11 +3417,7 @@ def _lower__ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL18__bfloat162uint_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162uint_ru") - if handle is None: - handle = __bfloat162uint_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162uint_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3580,11 +3458,7 @@ def _lower__ZL18__uint2bfloat16_rnj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rnj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rnj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rn") - if handle is None: - handle = __uint2bfloat16_rn - - @lower(handle, uint32) + @lower(__uint2bfloat16_rn, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3625,11 +3499,7 @@ def _lower__ZL18__uint2bfloat16_rzj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rzj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_rzj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rz") - if handle is None: - handle = __uint2bfloat16_rz - - @lower(handle, uint32) + @lower(__uint2bfloat16_rz, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3670,11 +3540,7 @@ def _lower__ZL18__uint2bfloat16_rdj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_rdj_nbst_caller(arg_0): return 
_ZL18__uint2bfloat16_rdj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_rd") - if handle is None: - handle = __uint2bfloat16_rd - - @lower(handle, uint32) + @lower(__uint2bfloat16_rd, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3715,11 +3581,7 @@ def _lower__ZL18__uint2bfloat16_ruj_nbst(shim_stream, shim_obj): def _ZL18__uint2bfloat16_ruj_nbst_caller(arg_0): return _ZL18__uint2bfloat16_ruj_nbst(arg_0) - handle = globals().get("__uint2bfloat16_ru") - if handle is None: - handle = __uint2bfloat16_ru - - @lower(handle, uint32) + @lower(__uint2bfloat16_ru, uint32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3761,11 +3623,7 @@ def _lower__ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rn") - if handle is None: - handle = __bfloat162ushort_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3807,11 +3665,7 @@ def _lower__ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rz") - if handle is None: - handle = __bfloat162ushort_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3853,11 +3707,7 @@ def _lower__ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_rd") - if handle is None: - handle = __bfloat162ushort_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3899,11 +3749,7 @@ def _lower__ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162ushort_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ushort_ru") - if handle is None: - handle = __bfloat162ushort_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ushort_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3944,11 +3790,7 @@ def _lower__ZL20__ushort2bfloat16_rnt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rnt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rnt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rn") - if handle is None: - handle = __ushort2bfloat16_rn - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rn, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -3989,11 +3831,7 @@ def _lower__ZL20__ushort2bfloat16_rzt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rzt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rzt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rz") - if handle is None: - handle = __ushort2bfloat16_rz - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rz, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4034,11 +3872,7 @@ def 
_lower__ZL20__ushort2bfloat16_rdt_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rdt_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rdt_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_rd") - if handle is None: - handle = __ushort2bfloat16_rd - - @lower(handle, uint16) + @lower(__ushort2bfloat16_rd, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4079,11 +3913,7 @@ def _lower__ZL20__ushort2bfloat16_rut_nbst(shim_stream, shim_obj): def _ZL20__ushort2bfloat16_rut_nbst_caller(arg_0): return _ZL20__ushort2bfloat16_rut_nbst(arg_0) - handle = globals().get("__ushort2bfloat16_ru") - if handle is None: - handle = __ushort2bfloat16_ru - - @lower(handle, uint16) + @lower(__ushort2bfloat16_ru, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4125,11 +3955,7 @@ def _lower__ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rn") - if handle is None: - handle = __bfloat162ull_rn - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4171,11 +3997,7 @@ def _lower__ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rz") - if handle is None: - handle = __bfloat162ull_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4219,11 
+4041,7 @@ def _lower__ZL14make_bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL14make_bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL14make_bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("make_bfloat162") - if handle is None: - handle = make_bfloat162 - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(make_bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4269,11 +4087,7 @@ def _lower__ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_rd") - if handle is None: - handle = __bfloat162ull_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4315,11 +4129,7 @@ def _lower__ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL17__bfloat162ull_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ull_ru") - if handle is None: - handle = __bfloat162ull_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ull_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4360,11 +4170,7 @@ def _lower__ZL17__ull2bfloat16_rny_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rny_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rny_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rn") - if handle is None: - handle = __ull2bfloat16_rn - - @lower(handle, uint64) + @lower(__ull2bfloat16_rn, uint64) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rny_nbst", shim_raw_str) @@ -4403,11 +4209,7 @@ def _lower__ZL17__ull2bfloat16_rzy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rzy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rzy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rz") - if handle is None: - handle = __ull2bfloat16_rz - - @lower(handle, uint64) + @lower(__ull2bfloat16_rz, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rzy_nbst", shim_raw_str) @@ -4446,11 +4248,7 @@ def _lower__ZL17__ull2bfloat16_rdy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_rdy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_rdy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_rd") - if handle is None: - handle = __ull2bfloat16_rd - - @lower(handle, uint64) + @lower(__ull2bfloat16_rd, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_rdy_nbst", shim_raw_str) @@ -4489,11 +4287,7 @@ def _lower__ZL17__ull2bfloat16_ruy_nbst(shim_stream, shim_obj): def _ZL17__ull2bfloat16_ruy_nbst_caller(arg_0): return _ZL17__ull2bfloat16_ruy_nbst(arg_0) - handle = globals().get("__ull2bfloat16_ru") - if handle is None: - handle = __ull2bfloat16_ru - - @lower(handle, uint64) + @lower(__ull2bfloat16_ru, uint64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL17__ull2bfloat16_ruy_nbst", shim_raw_str) @@ -4533,11 +4327,7 @@ def _lower__ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rn13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rn") - if handle is None: - handle = __bfloat162ll_rn - - @lower(handle, _type___nv_bfloat16) + 
@lower(__bfloat162ll_rn, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4579,11 +4369,7 @@ def _lower__ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rz13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rz") - if handle is None: - handle = __bfloat162ll_rz - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_rz, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4625,11 +4411,7 @@ def _lower__ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_rd13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_rd") - if handle is None: - handle = __bfloat162ll_rd - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_rd, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4671,11 +4453,7 @@ def _lower__ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst_caller(arg_0): return _ZL16__bfloat162ll_ru13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162ll_ru") - if handle is None: - handle = __bfloat162ll_ru - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162ll_ru, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4716,11 +4494,7 @@ def _lower__ZL16__ll2bfloat16_rnx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rnx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rnx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rn") - if handle is None: - handle = __ll2bfloat16_rn 
- - @lower(handle, int64) + @lower(__ll2bfloat16_rn, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rnx_nbst", shim_raw_str) @@ -4759,11 +4533,7 @@ def _lower__ZL16__ll2bfloat16_rzx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rzx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rzx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rz") - if handle is None: - handle = __ll2bfloat16_rz - - @lower(handle, int64) + @lower(__ll2bfloat16_rz, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rzx_nbst", shim_raw_str) @@ -4802,11 +4572,7 @@ def _lower__ZL16__ll2bfloat16_rdx_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rdx_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rdx_nbst(arg_0) - handle = globals().get("__ll2bfloat16_rd") - if handle is None: - handle = __ll2bfloat16_rd - - @lower(handle, int64) + @lower(__ll2bfloat16_rd, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rdx_nbst", shim_raw_str) @@ -4845,11 +4611,7 @@ def _lower__ZL16__ll2bfloat16_rux_nbst(shim_stream, shim_obj): def _ZL16__ll2bfloat16_rux_nbst_caller(arg_0): return _ZL16__ll2bfloat16_rux_nbst(arg_0) - handle = globals().get("__ll2bfloat16_ru") - if handle is None: - handle = __ll2bfloat16_ru - - @lower(handle, int64) + @lower(__ll2bfloat16_ru, int64) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL16__ll2bfloat16_rux_nbst", shim_raw_str) @@ -4889,11 +4651,7 @@ def _lower__ZL6htrunc13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6htrunc13__nv_bfloat16_nbst_caller(arg_0): return _ZL6htrunc13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htrunc") - if handle is None: - handle = htrunc - - @lower(handle, 
_type___nv_bfloat16) + @lower(htrunc, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4935,11 +4693,7 @@ def _lower__ZL5hceil13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hceil13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hceil13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hceil") - if handle is None: - handle = hceil - - @lower(handle, _type___nv_bfloat16) + @lower(hceil, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -4981,11 +4735,7 @@ def _lower__ZL6hfloor13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hfloor13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hfloor13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hfloor") - if handle is None: - handle = hfloor - - @lower(handle, _type___nv_bfloat16) + @lower(hfloor, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5027,11 +4777,7 @@ def _lower__ZL5hrint13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hrint13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hrint13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrint") - if handle is None: - handle = hrint - - @lower(handle, _type___nv_bfloat16) + @lower(hrint, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5073,11 +4819,7 @@ def _lower__ZL7h2trunc14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2trunc14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2trunc14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2trunc") - if handle is None: - handle = h2trunc - - @lower(handle, _type___nv_bfloat162) + @lower(h2trunc, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -5119,11 +4861,7 @@ def _lower__ZL6h2ceil14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2ceil14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2ceil14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2ceil") - if handle is None: - handle = h2ceil - - @lower(handle, _type___nv_bfloat162) + @lower(h2ceil, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5165,11 +4903,7 @@ def _lower__ZL7h2floor14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2floor14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2floor14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2floor") - if handle is None: - handle = h2floor - - @lower(handle, _type___nv_bfloat162) + @lower(h2floor, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5211,11 +4945,7 @@ def _lower__ZL6h2rint14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2rint14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2rint14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rint") - if handle is None: - handle = h2rint - - @lower(handle, _type___nv_bfloat162) + @lower(h2rint, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5257,11 +4987,7 @@ def _lower__ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat162bfloat16213__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat162bfloat162") - if handle is None: - handle = __bfloat162bfloat162 - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat162bfloat162, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5303,11 +5029,7 @@ def 
_lower__ZL17__lowhigh2highlow14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL17__lowhigh2highlow14__nv_bfloat162_nbst_caller(arg_0): return _ZL17__lowhigh2highlow14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__lowhigh2highlow") - if handle is None: - handle = __lowhigh2highlow - - @lower(handle, _type___nv_bfloat162) + @lower(__lowhigh2highlow, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5351,11 +5073,7 @@ def _lower__ZL16__lows2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL16__lows2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL16__lows2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__lows2bfloat162") - if handle is None: - handle = __lows2bfloat162 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__lows2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5403,11 +5121,7 @@ def _lower__ZL17__highs2bfloat16214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL17__highs2bfloat16214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL17__highs2bfloat16214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__highs2bfloat162") - if handle is None: - handle = __highs2bfloat162 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__highs2bfloat162, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5453,11 +5167,7 @@ def _lower__ZL15__high2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__high2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL15__high2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2bfloat16") - if handle is None: - handle = __high2bfloat16 
- - @lower(handle, _type___nv_bfloat162) + @lower(__high2bfloat16, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5499,11 +5209,7 @@ def _lower__ZL14__low2bfloat1614__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL14__low2bfloat1614__nv_bfloat162_nbst_caller(arg_0): return _ZL14__low2bfloat1614__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2bfloat16") - if handle is None: - handle = __low2bfloat16 - - @lower(handle, _type___nv_bfloat162) + @lower(__low2bfloat16, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5544,11 +5250,7 @@ def _lower__ZL8__hisinf13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisinf13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisinf13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hisinf") - if handle is None: - handle = __hisinf - - @lower(handle, _type___nv_bfloat16) + @lower(__hisinf, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5592,11 +5294,7 @@ def _lower__ZL18__halves2bfloat16213__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL18__halves2bfloat16213__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL18__halves2bfloat16213__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__halves2bfloat162") - if handle is None: - handle = __halves2bfloat162 - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__halves2bfloat162, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5642,11 +5340,7 @@ def _lower__ZL15__low2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL15__low2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return 
_ZL15__low2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__low2bfloat162") - if handle is None: - handle = __low2bfloat162 - - @lower(handle, _type___nv_bfloat162) + @lower(__low2bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5688,11 +5382,7 @@ def _lower__ZL16__high2bfloat16214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL16__high2bfloat16214__nv_bfloat162_nbst_caller(arg_0): return _ZL16__high2bfloat16214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__high2bfloat162") - if handle is None: - handle = __high2bfloat162 - - @lower(handle, _type___nv_bfloat162) + @lower(__high2bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5734,11 +5424,7 @@ def _lower__ZL19__bfloat16_as_short13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL19__bfloat16_as_short13__nv_bfloat16_nbst_caller(arg_0): return _ZL19__bfloat16_as_short13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat16_as_short") - if handle is None: - handle = __bfloat16_as_short - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat16_as_short, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5780,11 +5466,7 @@ def _lower__ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst_caller(arg_0): return _ZL20__bfloat16_as_ushort13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__bfloat16_as_ushort") - if handle is None: - handle = __bfloat16_as_ushort - - @lower(handle, _type___nv_bfloat16) + @lower(__bfloat16_as_ushort, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5825,11 +5507,7 @@ def 
_lower__ZL19__short_as_bfloat16s_nbst(shim_stream, shim_obj): def _ZL19__short_as_bfloat16s_nbst_caller(arg_0): return _ZL19__short_as_bfloat16s_nbst(arg_0) - handle = globals().get("__short_as_bfloat16") - if handle is None: - handle = __short_as_bfloat16 - - @lower(handle, int16) + @lower(__short_as_bfloat16, int16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5870,11 +5548,7 @@ def _lower__ZL20__ushort_as_bfloat16t_nbst(shim_stream, shim_obj): def _ZL20__ushort_as_bfloat16t_nbst_caller(arg_0): return _ZL20__ushort_as_bfloat16t_nbst(arg_0) - handle = globals().get("__ushort_as_bfloat16") - if handle is None: - handle = __ushort_as_bfloat16 - - @lower(handle, uint16) + @lower(__ushort_as_bfloat16, uint16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5925,11 +5599,7 @@ def _ZL11__shfl_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_sync") - if handle is None: - handle = __shfl_sync - - @lower(handle, uint32, _type___nv_bfloat162, int32, int32) + @lower(__shfl_sync, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -5986,11 +5656,7 @@ def _ZL14__shfl_up_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_up_sync") - if handle is None: - handle = __shfl_up_sync - - @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) + @lower(__shfl_up_sync, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6047,11 +5713,7 @@ def _ZL16__shfl_down_syncj14__nv_bfloat162ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_down_sync") - if handle is None: - handle = 
__shfl_down_sync - - @lower(handle, uint32, _type___nv_bfloat162, uint32, int32) + @lower(__shfl_down_sync, uint32, _type___nv_bfloat162, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6108,11 +5770,7 @@ def _ZL15__shfl_xor_syncj14__nv_bfloat162ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_xor_sync") - if handle is None: - handle = __shfl_xor_sync - - @lower(handle, uint32, _type___nv_bfloat162, int32, int32) + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat162, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6165,11 +5823,7 @@ def _ZL11__shfl_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_sync") - if handle is None: - handle = __shfl_sync - - @lower(handle, uint32, _type___nv_bfloat16, int32, int32) + @lower(__shfl_sync, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6222,11 +5876,7 @@ def _ZL14__shfl_up_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_up_sync") - if handle is None: - handle = __shfl_up_sync - - @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) + @lower(__shfl_up_sync, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6279,11 +5929,7 @@ def _ZL16__shfl_down_syncj13__nv_bfloat16ji_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_down_sync") - if handle is None: - handle = __shfl_down_sync - - @lower(handle, uint32, _type___nv_bfloat16, uint32, int32) + @lower(__shfl_down_sync, uint32, _type___nv_bfloat16, uint32, int32) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6336,11 +5982,7 @@ def _ZL15__shfl_xor_syncj13__nv_bfloat16ii_nbst_caller( arg_0, arg_1, arg_2, arg_3 ) - handle = globals().get("__shfl_xor_sync") - if handle is None: - handle = __shfl_xor_sync - - @lower(handle, uint32, _type___nv_bfloat16, int32, int32) + @lower(__shfl_xor_sync, uint32, _type___nv_bfloat16, int32, int32) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6388,11 +6030,7 @@ def _lower__ZL5__ldgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5__ldgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL5__ldgPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldg") - if handle is None: - handle = __ldg - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldg, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6432,11 +6070,7 @@ def _lower__ZL5__ldgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5__ldgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL5__ldgPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldg") - if handle is None: - handle = __ldg - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldg, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6480,11 +6114,7 @@ def _lower__ZL6__ldcgPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcgPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcg") - if handle is None: - handle = __ldcg - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcg, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6524,11 
+6154,7 @@ def _lower__ZL6__ldcgPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcgPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcgPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcg") - if handle is None: - handle = __ldcg - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcg, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6572,11 +6198,7 @@ def _lower__ZL6__ldcaPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcaPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldca") - if handle is None: - handle = __ldca - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldca, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6616,11 +6238,7 @@ def _lower__ZL6__ldcaPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcaPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcaPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldca") - if handle is None: - handle = __ldca - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldca, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6664,11 +6282,7 @@ def _lower__ZL6__ldcsPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcsPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcs") - if handle is None: - handle = __ldcs - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcs, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6708,11 +6322,7 @@ def 
_lower__ZL6__ldcsPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldcsPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcsPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcs") - if handle is None: - handle = __ldcs - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcs, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6756,11 +6366,7 @@ def _lower__ZL6__ldluPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldluPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldluPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldlu") - if handle is None: - handle = __ldlu - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldlu, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6800,11 +6406,7 @@ def _lower__ZL6__ldluPK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__ldluPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldluPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldlu") - if handle is None: - handle = __ldlu - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldlu, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6848,11 +6450,7 @@ def _lower__ZL6__ldcvPK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6__ldcvPK14__nv_bfloat162_nbst_caller(arg_0): return _ZL6__ldcvPK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("__ldcv") - if handle is None: - handle = __ldcv - - @lower(handle, CPointer(_type___nv_bfloat162)) + @lower(__ldcv, CPointer(_type___nv_bfloat162)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6892,11 +6490,7 @@ def _lower__ZL6__ldcvPK13__nv_bfloat16_nbst(shim_stream, 
shim_obj): def _ZL6__ldcvPK13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__ldcvPK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__ldcv") - if handle is None: - handle = __ldcv - - @lower(handle, CPointer(_type___nv_bfloat16)) + @lower(__ldcv, CPointer(_type___nv_bfloat16)) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6943,11 +6537,7 @@ def _lower__ZL6__stwbP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwbP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stwb") - if handle is None: - handle = __stwb - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stwb, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -6992,11 +6582,7 @@ def _lower__ZL6__stwbP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwbP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwbP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stwb") - if handle is None: - handle = __stwb - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stwb, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7045,11 +6631,7 @@ def _lower__ZL6__stcgP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcgP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stcg") - if handle is None: - handle = __stcg - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stcg, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7094,11 +6676,7 @@ def _lower__ZL6__stcgP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcgP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcgP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stcg") - if handle is None: - handle = __stcg - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stcg, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7147,11 +6725,7 @@ def _lower__ZL6__stcsP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stcsP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stcs") - if handle is None: - handle = __stcs - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stcs, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7196,11 +6770,7 @@ def _lower__ZL6__stcsP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stcsP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stcsP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stcs") - if handle is None: - handle = __stcs - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stcs, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7249,11 +6819,7 @@ def _lower__ZL6__stwtP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__stwtP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__stwt") - if handle is None: - handle = 
__stwt - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(__stwt, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7298,11 +6864,7 @@ def _lower__ZL6__stwtP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__stwtP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__stwtP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__stwt") - if handle is None: - handle = __stwt - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(__stwt, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7350,11 +6912,7 @@ def _lower__ZL6__heq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__heq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__heq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__heq2") - if handle is None: - handle = __heq2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__heq2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7402,11 +6960,7 @@ def _lower__ZL6__hne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hne2") - if handle is None: - handle = __hne2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hne2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7454,11 +7008,7 @@ def _lower__ZL6__hle214__nv_bfloat162S__nbst(shim_stream, shim_obj): def 
_ZL6__hle214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hle214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hle2") - if handle is None: - handle = __hle2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hle2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7506,11 +7056,7 @@ def _lower__ZL6__hge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hge2") - if handle is None: - handle = __hge2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hge2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7558,11 +7104,7 @@ def _lower__ZL6__hlt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hlt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hlt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hlt2") - if handle is None: - handle = __hlt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hlt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7610,11 +7152,7 @@ def _lower__ZL6__hgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL6__hgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL6__hgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgt2") - if handle is None: - handle = __hgt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -7662,11 +7200,7 @@ def _lower__ZL7__hequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hequ2") - if handle is None: - handle = __hequ2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hequ2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7714,11 +7248,7 @@ def _lower__ZL7__hneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hneu2") - if handle is None: - handle = __hneu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hneu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7766,11 +7296,7 @@ def _lower__ZL7__hleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hleu2") - if handle is None: - handle = __hleu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hleu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7818,11 +7344,7 @@ def _lower__ZL7__hgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu2") - if handle is None: - handle = __hgeu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgeu2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7870,11 +7392,7 @@ def _lower__ZL7__hltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hltu2") - if handle is None: - handle = __hltu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hltu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7922,11 +7440,7 @@ def _lower__ZL7__hgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu2") - if handle is None: - handle = __hgtu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgtu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -7972,11 +7486,7 @@ def _lower__ZL11__heq2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__heq2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__heq2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__heq2_mask") - if handle is None: - handle = __heq2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__heq2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8022,11 +7532,7 @@ def _lower__ZL11__hne2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hne2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL11__hne2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hne2_mask") - if handle is None: - handle = __hne2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hne2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8072,11 +7578,7 @@ def _lower__ZL11__hle2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hle2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hle2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hle2_mask") - if handle is None: - handle = __hle2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hle2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8122,11 +7624,7 @@ def _lower__ZL11__hge2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hge2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hge2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hge2_mask") - if handle is None: - handle = __hge2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hge2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8172,11 +7670,7 @@ def _lower__ZL11__hlt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hlt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hlt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hlt2_mask") - if handle is None: - handle = __hlt2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hlt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8222,11 +7716,7 @@ def _lower__ZL11__hgt2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hgt2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hgt2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgt2_mask") - if handle is None: - handle = __hgt2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgt2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8272,11 +7762,7 @@ def _lower__ZL12__hequ2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hequ2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hequ2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hequ2_mask") - if handle is None: - handle = __hequ2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hequ2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8322,11 +7808,7 @@ def _lower__ZL12__hneu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hneu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hneu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hneu2_mask") - if handle is None: - handle = __hneu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hneu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8372,11 +7854,7 @@ def _lower__ZL12__hleu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hleu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hleu2_mask14__nv_bfloat162S__nbst(arg_0, 
arg_1) - handle = globals().get("__hleu2_mask") - if handle is None: - handle = __hleu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hleu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8422,11 +7900,7 @@ def _lower__ZL12__hgeu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgeu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgeu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu2_mask") - if handle is None: - handle = __hgeu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgeu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8472,11 +7946,7 @@ def _lower__ZL12__hltu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hltu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hltu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hltu2_mask") - if handle is None: - handle = __hltu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hltu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8522,11 +7992,7 @@ def _lower__ZL12__hgtu2_mask14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL12__hgtu2_mask14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL12__hgtu2_mask14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu2_mask") - if handle is None: - handle = __hgtu2_mask - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hgtu2_mask, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): 
context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8572,11 +8038,7 @@ def _lower__ZL9__hisnan214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL9__hisnan214__nv_bfloat162_nbst_caller(arg_0): return _ZL9__hisnan214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__hisnan2") - if handle is None: - handle = __hisnan2 - - @lower(handle, _type___nv_bfloat162) + @lower(__hisnan2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8620,11 +8082,7 @@ def _lower__ZL7__hadd214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hadd214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hadd214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2") - if handle is None: - handle = __hadd2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8672,11 +8130,7 @@ def _lower__ZL7__hsub214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hsub214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hsub214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2") - if handle is None: - handle = __hsub2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8724,11 +8178,7 @@ def _lower__ZL7__hmul214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmul214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmul214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2") - if handle is None: - handle = __hmul2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8776,11 +8226,7 @@ def _lower__ZL10__hadd2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hadd2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hadd2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2_rn") - if handle is None: - handle = __hadd2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8828,11 +8274,7 @@ def _lower__ZL10__hsub2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hsub2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hsub2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2_rn") - if handle is None: - handle = __hsub2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8880,11 +8322,7 @@ def _lower__ZL10__hmul2_rn14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL10__hmul2_rn14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL10__hmul2_rn14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2_rn") - if handle is None: - handle = __hmul2_rn - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2_rn, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8932,11 +8370,7 @@ def _lower__ZL7__h2div14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__h2div14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return 
_ZL7__h2div14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__h2div") - if handle is None: - handle = __h2div - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__h2div, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -8982,11 +8416,7 @@ def _lower__ZL7__habs214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__habs214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__habs214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__habs2") - if handle is None: - handle = __habs2 - - @lower(handle, _type___nv_bfloat162) + @lower(__habs2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9030,11 +8460,7 @@ def _lower__ZL11__hadd2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hadd2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hadd2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hadd2_sat") - if handle is None: - handle = __hadd2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hadd2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9082,11 +8508,7 @@ def _lower__ZL11__hsub2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hsub2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hsub2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hsub2_sat") - if handle is None: - handle = __hsub2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hsub2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9134,11 +8556,7 @@ def 
_lower__ZL11__hmul2_sat14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmul2_sat14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmul2_sat14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmul2_sat") - if handle is None: - handle = __hmul2_sat - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmul2_sat, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9188,12 +8606,11 @@ def _lower__ZL7__hfma214__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL7__hfma214__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL7__hfma214__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2") - if handle is None: - handle = __hfma2 - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9245,12 +8662,11 @@ def _lower__ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma2_sat14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2_sat") - if handle is None: - handle = __hfma2_sat - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2_sat, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9298,11 +8714,7 @@ def _lower__ZL7__hneg214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7__hneg214__nv_bfloat162_nbst_caller(arg_0): return _ZL7__hneg214__nv_bfloat162_nbst(arg_0) - handle = globals().get("__hneg2") - if handle is None: - handle = __hneg2 - - @lower(handle, 
_type___nv_bfloat162) + @lower(__hneg2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9344,11 +8756,7 @@ def _lower__ZL6__habs13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__habs13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__habs13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__habs") - if handle is None: - handle = __habs - - @lower(handle, _type___nv_bfloat16) + @lower(__habs, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9392,11 +8800,7 @@ def _lower__ZL6__hadd13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hadd13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hadd13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd") - if handle is None: - handle = __hadd - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9444,11 +8848,7 @@ def _lower__ZL6__hsub13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hsub13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hsub13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub") - if handle is None: - handle = __hsub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9496,11 +8896,7 @@ def _lower__ZL6__hmul13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmul13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmul13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul") - if handle is None: - handle = __hmul - - @lower(handle, _type___nv_bfloat16, 
_type___nv_bfloat16) + @lower(__hmul, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9548,11 +8944,7 @@ def _lower__ZL9__hadd_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hadd_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hadd_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd_rn") - if handle is None: - handle = __hadd_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9600,11 +8992,7 @@ def _lower__ZL9__hsub_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hsub_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hsub_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub_rn") - if handle is None: - handle = __hsub_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9652,11 +9040,7 @@ def _lower__ZL9__hmul_rn13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9__hmul_rn13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9__hmul_rn13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul_rn") - if handle is None: - handle = __hmul_rn - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmul_rn, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9704,11 +9088,7 @@ def _lower__ZL6__hdiv13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hdiv13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return 
_ZL6__hdiv13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hdiv") - if handle is None: - handle = __hdiv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hdiv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9756,11 +9136,7 @@ def _lower__ZL10__hadd_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hadd_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hadd_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hadd_sat") - if handle is None: - handle = __hadd_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hadd_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9808,11 +9184,7 @@ def _lower__ZL10__hsub_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hsub_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hsub_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hsub_sat") - if handle is None: - handle = __hsub_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hsub_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -9860,11 +9232,7 @@ def _lower__ZL10__hmul_sat13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmul_sat13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmul_sat13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmul_sat") - if handle is None: - handle = __hmul_sat - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmul_sat, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -9914,12 +9282,8 @@ def _lower__ZL6__hfma13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL6__hfma13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL6__hfma13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma") - if handle is None: - handle = __hfma - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -9971,12 +9335,11 @@ def _lower__ZL10__hfma_sat13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL10__hfma_sat13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL10__hfma_sat13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma_sat") - if handle is None: - handle = __hfma_sat - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma_sat, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -10024,11 +9387,7 @@ def _lower__ZL6__hneg13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6__hneg13__nv_bfloat16_nbst_caller(arg_0): return _ZL6__hneg13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hneg") - if handle is None: - handle = __hneg - - @lower(handle, _type___nv_bfloat16) + @lower(__hneg, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10070,11 +9429,7 @@ def _lower__ZL7__hbeq214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbeq214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbeq214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbeq2") - if handle is None: - handle = __hbeq2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbeq2, 
_type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10120,11 +9475,7 @@ def _lower__ZL7__hbne214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbne214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbne214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbne2") - if handle is None: - handle = __hbne2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbne2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10170,11 +9521,7 @@ def _lower__ZL7__hble214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hble214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hble214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hble2") - if handle is None: - handle = __hble2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hble2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10220,11 +9567,7 @@ def _lower__ZL7__hbge214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbge214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbge214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbge2") - if handle is None: - handle = __hbge2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbge2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10270,11 +9613,7 @@ def _lower__ZL7__hblt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hblt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hblt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = 
globals().get("__hblt2") - if handle is None: - handle = __hblt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hblt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10320,11 +9659,7 @@ def _lower__ZL7__hbgt214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hbgt214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hbgt214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgt2") - if handle is None: - handle = __hbgt2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgt2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10370,11 +9705,7 @@ def _lower__ZL8__hbequ214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbequ214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbequ214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbequ2") - if handle is None: - handle = __hbequ2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbequ2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10420,11 +9751,7 @@ def _lower__ZL8__hbneu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbneu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbneu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbneu2") - if handle is None: - handle = __hbneu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbneu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10470,11 +9797,7 @@ def 
_lower__ZL8__hbleu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbleu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbleu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbleu2") - if handle is None: - handle = __hbleu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbleu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10520,11 +9843,7 @@ def _lower__ZL8__hbgeu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgeu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgeu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgeu2") - if handle is None: - handle = __hbgeu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgeu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10570,11 +9889,7 @@ def _lower__ZL8__hbltu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbltu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbltu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbltu2") - if handle is None: - handle = __hbltu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbltu2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10620,11 +9935,7 @@ def _lower__ZL8__hbgtu214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL8__hbgtu214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL8__hbgtu214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hbgtu2") - if handle is None: - handle = __hbgtu2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hbgtu2, _type___nv_bfloat162, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10670,11 +9981,7 @@ def _lower__ZL5__heq13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__heq13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__heq13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__heq") - if handle is None: - handle = __heq - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__heq, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10720,11 +10027,7 @@ def _lower__ZL5__hne13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hne13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hne13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hne") - if handle is None: - handle = __hne - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hne, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10770,11 +10073,7 @@ def _lower__ZL5__hle13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hle13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hle13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hle") - if handle is None: - handle = __hle - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hle, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10820,11 +10119,7 @@ def _lower__ZL5__hge13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hge13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hge13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hge") - if handle is None: - handle = __hge - - @lower(handle, 
_type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hge, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10870,11 +10165,7 @@ def _lower__ZL5__hlt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hlt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hlt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hlt") - if handle is None: - handle = __hlt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hlt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10920,11 +10211,7 @@ def _lower__ZL5__hgt13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL5__hgt13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL5__hgt13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgt") - if handle is None: - handle = __hgt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -10970,11 +10257,7 @@ def _lower__ZL6__hequ13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hequ13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hequ13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hequ") - if handle is None: - handle = __hequ - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hequ, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11020,11 +10303,7 @@ def _lower__ZL6__hneu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hneu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hneu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = 
globals().get("__hneu") - if handle is None: - handle = __hneu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hneu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11070,11 +10349,7 @@ def _lower__ZL6__hleu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hleu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hleu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hleu") - if handle is None: - handle = __hleu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hleu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11120,11 +10395,7 @@ def _lower__ZL6__hgeu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hgeu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgeu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgeu") - if handle is None: - handle = __hgeu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgeu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11170,11 +10441,7 @@ def _lower__ZL6__hltu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hltu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hltu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hltu") - if handle is None: - handle = __hltu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hltu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11220,11 +10487,7 @@ def _lower__ZL6__hgtu13__nv_bfloat16S__nbst(shim_stream, shim_obj): def 
_ZL6__hgtu13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hgtu13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hgtu") - if handle is None: - handle = __hgtu - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hgtu, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11269,11 +10532,7 @@ def _lower__ZL8__hisnan13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL8__hisnan13__nv_bfloat16_nbst_caller(arg_0): return _ZL8__hisnan13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__hisnan") - if handle is None: - handle = __hisnan - - @lower(handle, _type___nv_bfloat16) + @lower(__hisnan, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11317,11 +10576,7 @@ def _lower__ZL6__hmax13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmax13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmax13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmax") - if handle is None: - handle = __hmax - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmax, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11369,11 +10624,7 @@ def _lower__ZL6__hmin13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL6__hmin13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL6__hmin13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmin") - if handle is None: - handle = __hmin - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmin, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11421,11 +10672,7 @@ def 
_lower__ZL10__hmax_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmax_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmax_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmax_nan") - if handle is None: - handle = __hmax_nan - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmax_nan, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11473,11 +10720,7 @@ def _lower__ZL10__hmin_nan13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL10__hmin_nan13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL10__hmin_nan13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("__hmin_nan") - if handle is None: - handle = __hmin_nan - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(__hmin_nan, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11527,12 +10770,11 @@ def _lower__ZL11__hfma_relu13__nv_bfloat16S_S__nbst(shim_stream, shim_obj): def _ZL11__hfma_relu13__nv_bfloat16S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL11__hfma_relu13__nv_bfloat16S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma_relu") - if handle is None: - handle = __hfma_relu - @lower( - handle, _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 + __hfma_relu, + _type___nv_bfloat16, + _type___nv_bfloat16, + _type___nv_bfloat16, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11582,11 +10824,7 @@ def _lower__ZL7__hmax214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmax214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmax214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmax2") - if handle is None: - handle = __hmax2 - - @lower(handle, _type___nv_bfloat162, 
_type___nv_bfloat162) + @lower(__hmax2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11634,11 +10872,7 @@ def _lower__ZL7__hmin214__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL7__hmin214__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL7__hmin214__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmin2") - if handle is None: - handle = __hmin2 - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmin2, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11686,11 +10920,7 @@ def _lower__ZL11__hmax2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmax2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmax2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmax2_nan") - if handle is None: - handle = __hmax2_nan - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmax2_nan, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11738,11 +10968,7 @@ def _lower__ZL11__hmin2_nan14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL11__hmin2_nan14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL11__hmin2_nan14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("__hmin2_nan") - if handle is None: - handle = __hmin2_nan - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(__hmin2_nan, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11792,12 +11018,11 @@ def _lower__ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def 
_ZL12__hfma2_relu14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL12__hfma2_relu14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hfma2_relu") - if handle is None: - handle = __hfma2_relu - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hfma2_relu, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11849,12 +11074,11 @@ def _lower__ZL8__hcmadd14__nv_bfloat162S_S__nbst(shim_stream, shim_obj): def _ZL8__hcmadd14__nv_bfloat162S_S__nbst_caller(arg_0, arg_1, arg_2): return _ZL8__hcmadd14__nv_bfloat162S_S__nbst(arg_0, arg_1, arg_2) - handle = globals().get("__hcmadd") - if handle is None: - handle = __hcmadd - @lower( - handle, _type___nv_bfloat162, _type___nv_bfloat162, _type___nv_bfloat162 + __hcmadd, + _type___nv_bfloat162, + _type___nv_bfloat162, + _type___nv_bfloat162, ) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) @@ -11902,11 +11126,7 @@ def _lower__ZL5hsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL5hsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hsqrt") - if handle is None: - handle = hsqrt - - @lower(handle, _type___nv_bfloat16) + @lower(hsqrt, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -11948,11 +11168,7 @@ def _lower__ZL6hrsqrt13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hrsqrt13__nv_bfloat16_nbst_caller(arg_0): return _ZL6hrsqrt13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrsqrt") - if handle is None: - handle = hrsqrt - - @lower(handle, _type___nv_bfloat16) + @lower(hrsqrt, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ 
-11994,11 +11210,7 @@ def _lower__ZL4hrcp13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hrcp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hrcp13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hrcp") - if handle is None: - handle = hrcp - - @lower(handle, _type___nv_bfloat16) + @lower(hrcp, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hrcp13__nv_bfloat16_nbst", shim_raw_str) @@ -12038,11 +11250,7 @@ def _lower__ZL4hlog13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hlog13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hlog13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog") - if handle is None: - handle = hlog - - @lower(handle, _type___nv_bfloat16) + @lower(hlog, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hlog13__nv_bfloat16_nbst", shim_raw_str) @@ -12082,11 +11290,7 @@ def _lower__ZL5hlog213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hlog213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hlog213__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog2") - if handle is None: - handle = hlog2 - - @lower(handle, _type___nv_bfloat16) + @lower(hlog2, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12128,11 +11332,7 @@ def _lower__ZL6hlog1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hlog1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hlog1013__nv_bfloat16_nbst(arg_0) - handle = globals().get("hlog10") - if handle is None: - handle = hlog10 - - @lower(handle, _type___nv_bfloat16) + @lower(hlog10, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12174,11 +11374,7 @@ def _lower__ZL4hexp13__nv_bfloat16_nbst(shim_stream, shim_obj): def 
_ZL4hexp13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hexp13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp") - if handle is None: - handle = hexp - - @lower(handle, _type___nv_bfloat16) + @lower(hexp, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hexp13__nv_bfloat16_nbst", shim_raw_str) @@ -12218,11 +11414,7 @@ def _lower__ZL12htanh_approx13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL12htanh_approx13__nv_bfloat16_nbst_caller(arg_0): return _ZL12htanh_approx13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htanh_approx") - if handle is None: - handle = htanh_approx - - @lower(handle, _type___nv_bfloat16) + @lower(htanh_approx, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12264,11 +11456,7 @@ def _lower__ZL13h2tanh_approx14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL13h2tanh_approx14__nv_bfloat162_nbst_caller(arg_0): return _ZL13h2tanh_approx14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2tanh_approx") - if handle is None: - handle = h2tanh_approx - - @lower(handle, _type___nv_bfloat162) + @lower(h2tanh_approx, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12310,11 +11498,7 @@ def _lower__ZL5htanh13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5htanh13__nv_bfloat16_nbst_caller(arg_0): return _ZL5htanh13__nv_bfloat16_nbst(arg_0) - handle = globals().get("htanh") - if handle is None: - handle = htanh - - @lower(handle, _type___nv_bfloat16) + @lower(htanh, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12356,11 +11540,7 @@ def _lower__ZL6h2tanh14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZL6h2tanh14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2tanh14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2tanh") - if handle is None: - handle = h2tanh - - @lower(handle, _type___nv_bfloat162) + @lower(h2tanh, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12402,11 +11582,7 @@ def _lower__ZL5hexp213__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL5hexp213__nv_bfloat16_nbst_caller(arg_0): return _ZL5hexp213__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp2") - if handle is None: - handle = hexp2 - - @lower(handle, _type___nv_bfloat16) + @lower(hexp2, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12448,11 +11624,7 @@ def _lower__ZL6hexp1013__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL6hexp1013__nv_bfloat16_nbst_caller(arg_0): return _ZL6hexp1013__nv_bfloat16_nbst(arg_0) - handle = globals().get("hexp10") - if handle is None: - handle = hexp10 - - @lower(handle, _type___nv_bfloat16) + @lower(hexp10, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12494,11 +11666,7 @@ def _lower__ZL4hcos13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hcos13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hcos13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hcos") - if handle is None: - handle = hcos - - @lower(handle, _type___nv_bfloat16) + @lower(hcos, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hcos13__nv_bfloat16_nbst", shim_raw_str) @@ -12538,11 +11706,7 @@ def _lower__ZL4hsin13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZL4hsin13__nv_bfloat16_nbst_caller(arg_0): return _ZL4hsin13__nv_bfloat16_nbst(arg_0) - handle = globals().get("hsin") - if 
handle is None: - handle = hsin - - @lower(handle, _type___nv_bfloat16) + @lower(hsin, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZL4hsin13__nv_bfloat16_nbst", shim_raw_str) @@ -12582,11 +11746,7 @@ def _lower__ZL6h2sqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2sqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2sqrt14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2sqrt") - if handle is None: - handle = h2sqrt - - @lower(handle, _type___nv_bfloat162) + @lower(h2sqrt, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12628,11 +11788,7 @@ def _lower__ZL7h2rsqrt14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2rsqrt14__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2rsqrt14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rsqrt") - if handle is None: - handle = h2rsqrt - - @lower(handle, _type___nv_bfloat162) + @lower(h2rsqrt, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12674,11 +11830,7 @@ def _lower__ZL5h2rcp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2rcp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2rcp14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2rcp") - if handle is None: - handle = h2rcp - - @lower(handle, _type___nv_bfloat162) + @lower(h2rcp, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12720,11 +11872,7 @@ def _lower__ZL5h2log14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2log14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2log14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log") - if handle is None: - handle = h2log - - @lower(handle, _type___nv_bfloat162) + @lower(h2log, 
_type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12766,11 +11914,7 @@ def _lower__ZL6h2log214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2log214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2log214__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log2") - if handle is None: - handle = h2log2 - - @lower(handle, _type___nv_bfloat162) + @lower(h2log2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12812,11 +11956,7 @@ def _lower__ZL7h2log1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2log1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2log1014__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2log10") - if handle is None: - handle = h2log10 - - @lower(handle, _type___nv_bfloat162) + @lower(h2log10, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12858,11 +11998,7 @@ def _lower__ZL5h2exp14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2exp14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2exp14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp") - if handle is None: - handle = h2exp - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12904,11 +12040,7 @@ def _lower__ZL6h2exp214__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL6h2exp214__nv_bfloat162_nbst_caller(arg_0): return _ZL6h2exp214__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp2") - if handle is None: - handle = h2exp2 - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp2, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -12950,11 +12082,7 @@ def _lower__ZL7h2exp1014__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL7h2exp1014__nv_bfloat162_nbst_caller(arg_0): return _ZL7h2exp1014__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2exp10") - if handle is None: - handle = h2exp10 - - @lower(handle, _type___nv_bfloat162) + @lower(h2exp10, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -12996,11 +12124,7 @@ def _lower__ZL5h2cos14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2cos14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2cos14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2cos") - if handle is None: - handle = h2cos - - @lower(handle, _type___nv_bfloat162) + @lower(h2cos, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13042,11 +12166,7 @@ def _lower__ZL5h2sin14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZL5h2sin14__nv_bfloat162_nbst_caller(arg_0): return _ZL5h2sin14__nv_bfloat162_nbst(arg_0) - handle = globals().get("h2sin") - if handle is None: - handle = h2sin - - @lower(handle, _type___nv_bfloat162) + @lower(h2sin, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13091,11 +12211,7 @@ def _lower__ZL9atomicAddP14__nv_bfloat162S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP14__nv_bfloat162S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP14__nv_bfloat162S__nbst(arg_0, arg_1) - handle = globals().get("atomicAdd") - if handle is None: - handle = atomicAdd - - @lower(handle, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) + @lower(atomicAdd, CPointer(_type___nv_bfloat162), _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13140,11 
+12256,7 @@ def _lower__ZL9atomicAddP13__nv_bfloat16S__nbst(shim_stream, shim_obj): def _ZL9atomicAddP13__nv_bfloat16S__nbst_caller(arg_0, arg_1): return _ZL9atomicAddP13__nv_bfloat16S__nbst(arg_0, arg_1) - handle = globals().get("atomicAdd") - if handle is None: - handle = atomicAdd - - @lower(handle, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) + @lower(atomicAdd, CPointer(_type___nv_bfloat16), _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13188,11 +12300,7 @@ def _lower__ZplRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZplRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZplRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.add") - if handle is None: - handle = operator.add - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.add, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13236,11 +12344,7 @@ def _lower__ZmiRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmiRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmiRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.sub") - if handle is None: - handle = operator.sub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.sub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13284,11 +12388,7 @@ def _lower__ZmlRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZmlRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZmlRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.mul") - if handle is None: - handle = operator.mul - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.mul, _type___nv_bfloat16, 
_type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13332,11 +12432,7 @@ def _lower__ZdvRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZdvRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZdvRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.truediv") - if handle is None: - handle = operator.truediv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.truediv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13380,11 +12476,7 @@ def _lower__ZpLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZpLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZpLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.iadd") - if handle is None: - handle = operator.iadd - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.iadd, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13428,11 +12520,7 @@ def _lower__ZmIR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmIR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmIR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.isub") - if handle is None: - handle = operator.isub - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.isub, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13476,11 +12564,7 @@ def _lower__ZmLR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZmLR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZmLR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.imul") - if 
handle is None: - handle = operator.imul - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.imul, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13524,11 +12608,7 @@ def _lower__ZdVR13__nv_bfloat16RKS__nbst(shim_stream, shim_obj): def _ZdVR13__nv_bfloat16RKS__nbst_caller(arg_0, arg_1): return _ZdVR13__nv_bfloat16RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.itruediv") - if handle is None: - handle = operator.itruediv - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.itruediv, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13570,11 +12650,7 @@ def _lower__ZpsRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZpsRK13__nv_bfloat16_nbst_caller(arg_0): return _ZpsRK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("operator.pos") - if handle is None: - handle = operator.pos - - @lower(handle, _type___nv_bfloat16) + @lower(operator.pos, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK13__nv_bfloat16_nbst", shim_raw_str) @@ -13610,11 +12686,7 @@ def _lower__ZngRK13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZngRK13__nv_bfloat16_nbst_caller(arg_0): return _ZngRK13__nv_bfloat16_nbst(arg_0) - handle = globals().get("operator.neg") - if handle is None: - handle = operator.neg - - @lower(handle, _type___nv_bfloat16) + @lower(operator.neg, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK13__nv_bfloat16_nbst", shim_raw_str) @@ -13650,11 +12722,7 @@ def _lower__ZeqRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZeqRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): 
return _ZeqRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.eq") - if handle is None: - handle = operator.eq - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.eq, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13696,11 +12764,7 @@ def _lower__ZneRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZneRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZneRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ne") - if handle is None: - handle = operator.ne - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.ne, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13742,11 +12806,7 @@ def _lower__ZgtRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgtRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgtRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.gt") - if handle is None: - handle = operator.gt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.gt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13788,11 +12848,7 @@ def _lower__ZltRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZltRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZltRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.lt") - if handle is None: - handle = operator.lt - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.lt, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13834,11 +12890,7 @@ def 
_lower__ZgeRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZgeRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZgeRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ge") - if handle is None: - handle = operator.ge - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.ge, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13880,11 +12932,7 @@ def _lower__ZleRK13__nv_bfloat16S1__nbst(shim_stream, shim_obj): def _ZleRK13__nv_bfloat16S1__nbst_caller(arg_0, arg_1): return _ZleRK13__nv_bfloat16S1__nbst(arg_0, arg_1) - handle = globals().get("operator.le") - if handle is None: - handle = operator.le - - @lower(handle, _type___nv_bfloat16, _type___nv_bfloat16) + @lower(operator.le, _type___nv_bfloat16, _type___nv_bfloat16) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13928,11 +12976,7 @@ def _lower__ZplRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZplRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZplRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.add") - if handle is None: - handle = operator.add - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.add, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -13976,11 +13020,7 @@ def _lower__ZmiRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmiRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmiRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.sub") - if handle is None: - handle = operator.sub - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.sub, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14024,11 +13064,7 @@ def _lower__ZmlRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZmlRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZmlRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.mul") - if handle is None: - handle = operator.mul - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.mul, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14072,11 +13108,7 @@ def _lower__ZdvRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZdvRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZdvRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.truediv") - if handle is None: - handle = operator.truediv - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.truediv, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14120,11 +13152,7 @@ def _lower__ZpLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZpLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZpLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.iadd") - if handle is None: - handle = operator.iadd - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.iadd, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14168,11 +13196,7 @@ def _lower__ZmIR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmIR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmIR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.isub") - if handle is None: - 
handle = operator.isub - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.isub, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14216,11 +13240,7 @@ def _lower__ZmLR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZmLR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZmLR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.imul") - if handle is None: - handle = operator.imul - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.imul, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14264,11 +13284,7 @@ def _lower__ZdVR14__nv_bfloat162RKS__nbst(shim_stream, shim_obj): def _ZdVR14__nv_bfloat162RKS__nbst_caller(arg_0, arg_1): return _ZdVR14__nv_bfloat162RKS__nbst(arg_0, arg_1) - handle = globals().get("operator.itruediv") - if handle is None: - handle = operator.itruediv - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.itruediv, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14310,11 +13326,7 @@ def _lower__ZpsRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def _ZpsRK14__nv_bfloat162_nbst_caller(arg_0): return _ZpsRK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("operator.pos") - if handle is None: - handle = operator.pos - - @lower(handle, _type___nv_bfloat162) + @lower(operator.pos, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZpsRK14__nv_bfloat162_nbst", shim_raw_str) @@ -14350,11 +13362,7 @@ def _lower__ZngRK14__nv_bfloat162_nbst(shim_stream, shim_obj): def 
_ZngRK14__nv_bfloat162_nbst_caller(arg_0): return _ZngRK14__nv_bfloat162_nbst(arg_0) - handle = globals().get("operator.neg") - if handle is None: - handle = operator.neg - - @lower(handle, _type___nv_bfloat162) + @lower(operator.neg, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key("_ZngRK14__nv_bfloat162_nbst", shim_raw_str) @@ -14390,11 +13398,7 @@ def _lower__ZeqRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZeqRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZeqRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.eq") - if handle is None: - handle = operator.eq - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.eq, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14436,11 +13440,7 @@ def _lower__ZneRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZneRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZneRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ne") - if handle is None: - handle = operator.ne - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.ne, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14482,11 +13482,7 @@ def _lower__ZgtRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgtRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgtRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.gt") - if handle is None: - handle = operator.gt - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.gt, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) 
shim_stream.write_with_key( @@ -14528,11 +13524,7 @@ def _lower__ZltRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZltRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZltRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.lt") - if handle is None: - handle = operator.lt - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.lt, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14574,11 +13566,7 @@ def _lower__ZgeRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZgeRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZgeRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.ge") - if handle is None: - handle = operator.ge - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.ge, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14620,11 +13608,7 @@ def _lower__ZleRK14__nv_bfloat162S1__nbst(shim_stream, shim_obj): def _ZleRK14__nv_bfloat162S1__nbst_caller(arg_0, arg_1): return _ZleRK14__nv_bfloat162S1__nbst(arg_0, arg_1) - handle = globals().get("operator.le") - if handle is None: - handle = operator.le - - @lower(handle, _type___nv_bfloat162, _type___nv_bfloat162) + @lower(operator.le, _type___nv_bfloat162, _type___nv_bfloat162) def impl(context, builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( @@ -14669,11 +13653,7 @@ def _lower__ZN6__halfC1E13__nv_bfloat16_nbst(shim_stream, shim_obj): def _ZN6__halfC1E13__nv_bfloat16_nbst_caller(arg_0): return _ZN6__halfC1E13__nv_bfloat16_nbst(arg_0) - handle = globals().get("__half") - if handle is None: - handle = __half - - @lower(handle, _type___nv_bfloat16) + @lower(__half, _type___nv_bfloat16) def impl(context, 
builder, sig, args): context.active_code_library.add_linking_file(shim_obj) shim_stream.write_with_key( From da312aa0b74a9f67abf6317545b79d2c55b753d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:22:57 -0700 Subject: [PATCH 23/56] apply binding patches --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 46 ++++++-------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 0eef75e12..a2af16d04 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -28,7 +28,6 @@ from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device -from numba.cuda._internal.cuda_bf16 import _type___nv_bfloat16 from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type from numba.types import ( @@ -50,8 +49,10 @@ uint64, void, ) +from numba.cuda.types import bfloat16 float32x2 = vector_types["float32x2"] +__half = float16 typing_registry = TypingRegistry() @@ -181,37 +182,7 @@ def resolve_y(self, obj): make_attribute_wrapper(_type_class_unnamed1405416, "y", "y") -@register -class _ctor_template_unnamed1405416(ConcreteTemplate): - key = globals()["unnamed1405416"] - cases = [] - - -register_global(unnamed1405416, Function(_ctor_template_unnamed1405416)) - - -# Typing for __nv_bfloat16 -class _type_class___nv_bfloat16(Number): - def __init__(self): - super().__init__(name="__nv_bfloat16") - self.alignof_ = 2 - self.bitwidth = 2 * 8 - - -_type___nv_bfloat16 = _type_class___nv_bfloat16() - - -# Make Python API for struct -__nv_bfloat16 = type("__nv_bfloat16", (), {"_nbtype": _type___nv_bfloat16}) - -as_numba_type.register(__nv_bfloat16, _type___nv_bfloat16) - - -@register_model(_type_class___nv_bfloat16) -class 
_model___nv_bfloat16(PrimitiveModel): - def __init__(self, dmm, fe_type): - be_type = ir.IntType(fe_type.bitwidth) - super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) +__nv_bfloat16 = _type___nv_bfloat16 = bfloat16 def _lower__ZN13__nv_bfloat16C1Ev(shim_stream, shim_obj): @@ -366,6 +337,17 @@ def ctor_impl(context, builder, sig, args): selfptr, align=getattr(_type___nv_bfloat16, "alignof_", None) ) + # By default, Numbast does not generate this cast because the c++ conversion + # constructor is marked explict. We enable it by hand here. + @lower_cast(float16, __nv_bfloat16) + def conversion_impl(context, builder, fromty, toty, value): + return ctor_impl( + context, + builder, + signature(__nv_bfloat16, fromty), + [value], + ) + _lower__ZN13__nv_bfloat16C1E6__half(shim_stream, shim_obj) From bc7dbaae3d1c785e41126562a52c3f1a1c642436 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 13:56:10 -0700 Subject: [PATCH 24/56] re-imports the bf16 intrinsics --- numba_cuda/numba/cuda/bf16.py | 63 +++++++++---------- .../numba/cuda/tests/cudapy/test_bfloat16.py | 60 +++++++++--------- 2 files changed, 60 insertions(+), 63 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 1ac3798c0..e29123bb2 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -3,22 +3,21 @@ target_registry, nv_bfloat16 as bfloat16, # Arithmetic intrinsics - __habs, - __hadd, - __hsub, - __hmul, - __hadd_rn, - __hsub_rn, - __hmul_rn, - __hdiv, - __hadd_sat, - __hsub_sat, - __hmul_sat, - __hfma, - __hfma_sat, - __hneg, - __hfma_relu, - atomicAdd, + __habs as habs, + __hadd as hadd, + __hsub as hsub, + __hmul as hmul, + __hadd_rn as hadd_rn, + __hsub_rn as hsub_rn, + __hmul_rn as hmul_rn, + __hdiv as hdiv, + __hadd_sat as hadd_sat, + __hsub_sat as hsub_sat, + __hmul_sat as hmul_sat, + __hfma as hfma, + __hfma_sat as hfma_sat, + __hneg as hneg, + __hfma_relu as hfma_relu, htrunc, hceil, hfloor, @@ -107,28 
+106,26 @@ def exp2_ol(a): except ImportError: pass - __all__ = [ "typing_registry", "target_registry", "bfloat16", # Arithmetic intrinsics - "__habs", - "__hadd", - "__hsub", - "__hmul", - "__hadd_rn", - "__hsub_rn", - "__hmul_rn", - "__hdiv", - "__hadd_sat", - "__hsub_sat", - "__hmul_sat", - "__hfma", - "__hfma_sat", - "__hneg", - "__hfma_relu", - "atomicAdd", + "habs", + "hadd", + "hsub", + "hmul", + "hadd_rn", + "hsub_rn", + "hmul_rn", + "hdiv", + "hadd_sat", + "hsub_sat", + "hmul_sat", + "hfma", + "hfma_sat", + "hneg", + "hfma_relu", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index af25a3860..b6210498a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,21 +1,21 @@ from numba import cuda, float32 from numba.cuda.bf16 import ( bfloat16, - __habs, - __hadd, - __hsub, - __hmul, - __hdiv, - __hadd_rn, - __hsub_rn, - __hmul_rn, - __hadd_sat, - __hsub_sat, - __hmul_sat, - __hfma, - __hfma_sat, - __hfma_relu, - __hneg, + habs, + hadd, + hsub, + hmul, + hadd_rn, + hsub_rn, + hmul_rn, + hdiv, + hadd_sat, + hsub_sat, + hmul_sat, + hfma, + hfma_sat, + hneg, + hfma_relu, ) from numba.cuda.testing import CUDATestCase @@ -86,17 +86,17 @@ def kernel(out): a = bfloat16(1.25) b = bfloat16(-2.5) - out[0] = float32(__habs(b)) - out[1] = float32(__hadd(a, b)) - out[2] = float32(__hsub(a, b)) - out[3] = float32(__hmul(a, b)) - out[4] = float32(__hdiv(b, a)) - out[5] = float32(__hneg(a)) - out[6] = float32(__hfma(a, b, b)) + out[0] = float32(habs(b)) + out[1] = float32(hadd(a, b)) + out[2] = float32(hsub(a, b)) + out[3] = float32(hmul(a, b)) + out[4] = float32(hdiv(b, a)) + out[5] = float32(hneg(a)) + out[6] = float32(hfma(a, b, b)) - out[7] = float32(__hadd_rn(a, b)) - out[8] = float32(__hsub_rn(a, b)) - out[9] = float32(__hmul_rn(a, b)) + out[7] = float32(hadd_rn(a, b)) + out[8] = float32(hsub_rn(a, 
b)) + out[9] = float32(hmul_rn(a, b)) out = cuda.device_array((10,), dtype="float32") kernel[1, 1](out) @@ -126,10 +126,10 @@ def kernel(out): a = bfloat16(1.5) b = bfloat16(0.75) - out[0] = float32(__hadd_sat(a, b)) # 2.25 -> 1.0 - out[1] = float32(__hsub_sat(b, a)) # -0.75 -> 0.0 - out[2] = float32(__hmul_sat(a, b)) # 1.125 -> 1.0 - out[3] = float32(__hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 + out[0] = float32(hadd_sat(a, b)) # 2.25 -> 1.0 + out[1] = float32(hsub_sat(b, a)) # -0.75 -> 0.0 + out[2] = float32(hmul_sat(a, b)) # 1.125 -> 1.0 + out[3] = float32(hfma_sat(a, b, a)) # 1.125 + 1.5 -> 1.0 out = cuda.device_array((4,), dtype="float32") kernel[1, 1](out) @@ -153,7 +153,7 @@ def kernel(out): b = bfloat16(2.0) c = bfloat16(0.0) - out[0] = float32(__hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 + out[0] = float32(hfma_relu(a, b, c)) # -3.0 -> relu -> 0.0 out = cuda.device_array((1,), dtype="float32") kernel[1, 1](out) From 04823e8acef71eedd286182805e6a398d68c8541 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:09:53 -0700 Subject: [PATCH 25/56] Add documentation for arithmetic operations --- docs/source/reference/types.rst | 86 +++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 774714c53..fc5e583e0 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -92,6 +92,7 @@ Construction of a single instance of a ``bfloat16`` object: - ``int32`` - ``uint64`` - ``uint32`` + - ``float16`` Conversely, ``bfloat16`` data can be cast back to existing native data type via ``dtype(b)``, where ``dtype`` is one of the data types above (except float16), @@ -100,7 +101,7 @@ and ``b`` is a bfloat16 object. 
Arithmetic ********** -Supported arithmetic operations on ``bfloat`16`` operands are: +Supported arithmetic operations on ``bfloat16`` operands are: - Arithmetic (``+``, ``-``, ``*``, ``/``) - Arithmetic assignment operators (``+=``, ``-=``, ``*=``, ``/=``) @@ -140,11 +141,11 @@ on ``bfloat16`` are provided: mode. .. function:: numba.cuda.bf16.hlog2(b) - Calculates bfloat16 decimal logarithm of input ``b`` in round-to-nearest-even - mode. + Calculates bfloat16 binary logarithm (base-2) of input ``b`` in + round-to-nearest-even mode. .. function:: numba.cuda.bf16.hlog10(b) - Calculates bfloat16 natural exponential function of input ``b`` in + Calculates bfloat16 common logarithm (base-10) of input ``b`` in round-to-nearest-even mode. .. function:: numba.cuda.bf16.hcos(b) @@ -187,3 +188,80 @@ on ``bfloat16`` are provided: .. function:: numba.cuda.bf16.hexp10(b) Calculates bfloat16 decimal exponential function of input ``b`` in round-to-nearest-even mode. + + +Arithmetic Intrinsics +********************* + +The following low-level arithmetic intrinsics are available under +``numba.cuda.bf16`` and map to CUDA bfloat16 arithmetic functions. Unless +otherwise noted, operations are performed in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.habs(a) + + Calculates the absolute value of input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hneg(a) + + Negates input ``a`` (bfloat16) and returns the result. + +.. function:: numba.cuda.bf16.hadd(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hadd_rn(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. Prevents + contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hadd_sat(a, b) + + Adds ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. 
function:: numba.cuda.bf16.hsub(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hsub_rn(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hsub_sat(a, b) + + Subtracts ``b`` from ``a`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hmul(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hmul_rn(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode. + Prevents contraction of separate operations into a fused-multiply-add. + +.. function:: numba.cuda.bf16.hmul_sat(a, b) + + Multiplies ``a`` and ``b`` (bfloat16) in round-to-nearest-even mode, with + saturation to the range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hdiv(a, b) + + Divides ``a`` by ``b`` (bfloat16) in round-to-nearest-even mode. + +.. function:: numba.cuda.bf16.hfma(a, b, c) + + Computes a fused multiply-add of ``a`` and ``b`` plus ``c`` (bfloat16) in + round-to-nearest-even mode; i.e. returns ``a * b + c``. + +.. function:: numba.cuda.bf16.hfma_sat(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with saturation to the + range ``[0.0, 1.0]``. NaN results are flushed to ``+0.0``. + +.. function:: numba.cuda.bf16.hfma_relu(a, b, c) + + Fused multiply-add in round-to-nearest-even mode with ReLU saturation; + i.e. returns ``max(0, a * b + c)``. 
From b7e0e8b89d572acb553e9e01a2ad1b9e43008c39 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:34:38 -0700 Subject: [PATCH 26/56] add logical intrinsics --- numba_cuda/numba/cuda/bf16.py | 39 +++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 107 ++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index e29123bb2..86cf7c510 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -18,6 +18,26 @@ __hfma_sat as hfma_sat, __hneg as hneg, __hfma_relu as hfma_relu, + # Comparison intrinsics + __heq as heq, + __hne as hne, + __hge as hge, + __hgt as hgt, + __hle as hle, + __hlt as hlt, + __hmax as hmax, + __hmin as hmin, + __hmax_nan as hmax_nan, + __hmin_nan as hmin_nan, + __hisinf as hisinf, + __hisnan as hisnan, + # Unordered comparison intrinsics + __hequ as hequ, + __hneu as hneu, + __hgeu as hgeu, + __hgtu as hgtu, + __hleu as hleu, + __hltu as hltu, htrunc, hceil, hfloor, @@ -126,6 +146,25 @@ def exp2_ol(a): "hfma_sat", "hneg", "hfma_relu", + # Comparison intrinsics + "heq", + "hne", + "hge", + "hgt", + "hle", + "hlt", + "hmax", + "hmin", + "hmax_nan", + "hmin_nan", + "hisinf", + "hisnan", + "hequ", + "hneu", + "hgeu", + "hgtu", + "hleu", + "hltu", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index b6210498a..3721c1506 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -16,6 +16,19 @@ hfma_sat, hneg, hfma_relu, + # Comparison intrinsics + heq, + hne, + hge, + hgt, + hle, + hlt, + hmax, + hmin, + hmax_nan, + hmin_nan, + hisnan, + hisinf, ) from numba.cuda.testing import CUDATestCase @@ -159,3 +172,97 @@ def kernel(out): kernel[1, 1](out) self.assertAlmostEqual(out[0], 0.0, delta=1e-3) + + def test_comparison_intrinsics(self): + self.skip_unsupported() + + def 
make_kernel(cmpfn): + @cuda.jit + def kernel(out, a, b): + a_bf16 = bfloat16(a) + b_bf16 = bfloat16(b) + out[0] = cmpfn(a_bf16, b_bf16) + + return kernel + + comparisons = [heq, hne, hge, hgt, hle, hlt] + ops = [ + lambda x, y: x == y, + lambda x, y: x != y, + lambda x, y: x >= y, + lambda x, y: x > y, + lambda x, y: x <= y, + lambda x, y: x < y, + ] + + for cmpfn, op in zip(comparisons, ops): + with self.subTest(cmpfn=cmpfn): + kernel = make_kernel(cmpfn) + out = cuda.device_array((1,), dtype="bool") + + a = 3.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 3.0)) + + a = 3.0 + b = 4.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(3.0, 4.0)) + + a = 4.0 + b = 3.0 + kernel[1, 1](out, a, b) + self.assertEqual(bool(out[0]), op(4.0, 3.0)) + + def test_hmax_hmin_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(3.0) + b = bfloat16(4.0) + out[0] = float32(hmax(a, b)) + out[1] = float32(hmin(a, b)) + + out = cuda.device_array((2,), dtype="float32") + kernel[1, 1](out) + self.assertAlmostEqual(out[0], 4.0, delta=1e-3) + self.assertAlmostEqual(out[1], 3.0, delta=1e-3) + + def test_nan_and_inf_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out_bool, out_int): + nanv = bfloat16(float("nan")) + infv = bfloat16(float("inf")) + out_bool[0] = hisnan(nanv) + out_int[0] = hisinf(infv) + + out_bool = cuda.device_array((1,), dtype="bool") + out_int = cuda.device_array((1,), dtype="int32") + kernel[1, 1](out_bool, out_int) + self.assertTrue(bool(out_bool[0])) + self.assertNotEqual(int(out_int[0]), 0) + + def test_hmax_nan_hmin_nan_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(float("nan")) + b = bfloat16(2.0) + out[0] = float32(hmax_nan(a, b)) + out[1] = float32(hmin_nan(a, b)) + out[2] = float32(hmax(a, b)) + out[3] = float32(hmin(a, b)) + + out = cuda.device_array((4,), dtype="float32") + kernel[1, 1](out) + # NaN-propagating 
variants should produce NaN + self.assertTrue(math.isnan(out[0])) + self.assertTrue(math.isnan(out[1])) + # Non-NaN variants should return the non-NaN operand + self.assertAlmostEqual(out[2], 2.0, delta=1e-3) + self.assertAlmostEqual(out[3], 2.0, delta=1e-3) From 3407e19bd078bc4a9ab943c13e125eaf95d55c91 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:36:02 -0700 Subject: [PATCH 27/56] make bfloat16 usable on host if ml_dtypes is installed --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 11 +++++++++++ numba_cuda/numba/cuda/types.py | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 3721c1506..7077ed122 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,3 +1,6 @@ +import unittest +from importlib.util import find_spec + from numba import cuda, float32 from numba.cuda.bf16 import ( bfloat16, @@ -266,3 +269,11 @@ def kernel(out): # Non-NaN variants should return the non-NaN operand self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + + @unittest.skipIf( + find_spec("ml_dtypes") is None, + "ml_dtypes is required to use bfloat16 on host", + ) + def test_use_bfloat16_on_host(self): + x = bfloat16(3.0) + self.assertEqual(x, 3.0) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 92e3cafde..5ddcaef5e 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -77,5 +77,13 @@ def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) + def cast_python_value(self, value): + try: + import ml_dtypes # noqa: F401 + + return ml_dtypes.bfloat16(value) + except ImportError: + raise NotImplementedError + bfloat16 = Bfloat16() From 2ce64ed10b1aa1602248f6706ec34c0ae221e871 Mon Sep 17 
00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:45:44 -0700 Subject: [PATCH 28/56] add comparison operators --- docs/source/reference/types.rst | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index fc5e583e0..b11d68186 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -265,3 +265,102 @@ otherwise noted, operations are performed in round-to-nearest-even mode. Fused multiply-add in round-to-nearest-even mode with ReLU saturation; i.e. returns ``max(0, a * b + c)``. + +Comparison Intrinsics +********************* + +Device-level comparison intrinsics operating on ``bfloat16`` values are +available under ``numba.cuda.bf16``. Unless stated otherwise, the ordered +comparisons return ``False`` if either input is NaN, following IEEE semantics. + +.. function:: numba.cuda.bf16.heq(a, b) + + Ordered equality. Returns ``True`` iff ``a == b``. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hne(a, b) + + Ordered inequality. Returns ``True`` iff ``a != b`` and neither input is NaN. + NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hge(a, b) + + Ordered greater-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hgt(a, b) + + Ordered greater-than. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hle(a, b) + + Ordered less-or-equal. NaN inputs yield ``False``. + +.. function:: numba.cuda.bf16.hlt(a, b) + + Ordered less-than. NaN inputs yield ``False``. + +The unordered comparison variants return ``True`` when either input is NaN: + +.. function:: numba.cuda.bf16.hequ(a, b) + + Unordered equality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a == b``. + +.. function:: numba.cuda.bf16.hneu(a, b) + + Unordered inequality. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a != b``. + +.. function:: numba.cuda.bf16.hgeu(a, b) + + Unordered greater-or-equal. 
Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a >= b``. + +.. function:: numba.cuda.bf16.hgtu(a, b) + + Unordered greater-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a > b``. + +.. function:: numba.cuda.bf16.hleu(a, b) + + Unordered less-or-equal. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a <= b``. + +.. function:: numba.cuda.bf16.hltu(a, b) + + Unordered less-than. Returns ``True`` if ``a`` or ``b`` is NaN, or if ``a < b``. + +Min/Max operations follow CUDA semantics for zeros and NaNs: + +.. function:: numba.cuda.bf16.hmax(a, b) + + Returns ``max(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmin(a, b) + + Returns ``min(a, b)`` with the following behavior: + if either input is NaN, the other input is returned; if both are NaN, + the canonical NaN is returned. If both inputs are zero, ``+0.0 > -0.0``. + +.. function:: numba.cuda.bf16.hmax_nan(a, b) + + Returns ``max(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +.. function:: numba.cuda.bf16.hmin_nan(a, b) + + Returns ``min(a, b)`` where NaNs pass through: if either input is NaN, + the canonical NaN is returned. + +Special value predicates: + +.. function:: numba.cuda.bf16.hisnan(a) + + Returns ``True`` if ``a`` is a NaN, ``False`` otherwise. + +.. function:: numba.cuda.bf16.hisinf(a) + + Returns a nonzero integer if ``a`` is infinite, otherwise ``0``. + +.. note:: + + Python comparison operators on ``bfloat16`` values in device code map to + the ordered comparisons above. For more details on the CUDA bfloat16 + comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions + `_. 
From 7d289b22c382188fb2b1d3a9f38f3c22348a0f6f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 14:59:35 -0700 Subject: [PATCH 29/56] add basic conversion: float, int bidirectional conversion intrinsics --- numba_cuda/numba/cuda/bf16.py | 28 +++++++ .../numba/cuda/tests/cudapy/test_bfloat16.py | 75 +++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index 86cf7c510..ad893961d 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -38,6 +38,20 @@ __hgtu as hgtu, __hleu as hleu, __hltu as hltu, + # Precision conversion and data movement + __bfloat162float as bfloat162float, + __float2bfloat16_rn as float2bfloat16_rn, + __float2bfloat16_rz as float2bfloat16_rz, + __float2bfloat16_rd as float2bfloat16_rd, + __float2bfloat16_ru as float2bfloat16_ru, + __int2bfloat16_rn as int2bfloat16_rn, + __int2bfloat16_rz as int2bfloat16_rz, + __int2bfloat16_rd as int2bfloat16_rd, + __int2bfloat16_ru as int2bfloat16_ru, + __bfloat162int_rn as bfloat162int_rn, + __bfloat162int_rz as bfloat162int_rz, + __bfloat162int_rd as bfloat162int_rd, + __bfloat162int_ru as bfloat162int_ru, htrunc, hceil, hfloor, @@ -165,6 +179,20 @@ def exp2_ol(a): "hgtu", "hleu", "hltu", + # Precision conversion and data movement + "bfloat162float", + "float2bfloat16_rn", + "float2bfloat16_rz", + "float2bfloat16_rd", + "float2bfloat16_ru", + "int2bfloat16_rn", + "int2bfloat16_rz", + "int2bfloat16_rd", + "int2bfloat16_ru", + "bfloat162int_rn", + "bfloat162int_rz", + "bfloat162int_rd", + "bfloat162int_ru", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 7077ed122..53309f671 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -32,6 +32,20 @@ hmin_nan, hisnan, hisinf, + # Conversion intrinsics + bfloat162float, + 
float2bfloat16_rn, + float2bfloat16_rz, + float2bfloat16_rd, + float2bfloat16_ru, + int2bfloat16_rn, + int2bfloat16_rz, + int2bfloat16_rd, + int2bfloat16_ru, + bfloat162int_rn, + bfloat162int_rz, + bfloat162int_rd, + bfloat162int_ru, ) from numba.cuda.testing import CUDATestCase @@ -270,6 +284,67 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + def test_precision_conversion_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel_float_to_bf16(out): + f = float32(3.14) + out[0] = float32(float2bfloat16_rn(f)) + out[1] = float32(float2bfloat16_rz(f)) + out[2] = float32(float2bfloat16_rd(f)) + out[3] = float32(float2bfloat16_ru(f)) + + @cuda.jit + def kernel_bf16_to_float(out): + a = bfloat16(3.14) + out[0] = bfloat162float(a) + + @cuda.jit + def kernel_int_to_bf16(out): + i = 3 + out[0] = float32(int2bfloat16_rn(i)) + out[1] = float32(int2bfloat16_rz(i)) + out[2] = float32(int2bfloat16_rd(i)) + out[3] = float32(int2bfloat16_ru(i)) + + @cuda.jit + def kernel_bf16_to_int(out): + a = bfloat16(3.14) + out[0] = bfloat162int_rn(a) + out[1] = bfloat162int_rz(a) + out[2] = bfloat162int_rd(a) + out[3] = bfloat162int_ru(a) + + out = cuda.device_array((4,), dtype="float32") + kernel_float_to_bf16[1, 1](out) + # Check they are near the original value in float32 after round-trip + # Note: Different rounding modes produce slightly different values + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) # rn + self.assertTrue(abs(out[1] - 3.140625) < 2e-2, out[1] - 3.140625) # rz + self.assertTrue(abs(out[2] - 3.140625) < 2e-2, out[2] - 3.140625) # rd + self.assertTrue(abs(out[3] - 3.140625) < 2e-2, out[3] - 3.140625) # ru + + out = cuda.device_array((1,), dtype="float32") + kernel_bf16_to_float[1, 1](out) + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) + + outi = cuda.device_array((4,), dtype="int32") + kernel_int_to_bf16[1, 1](outi) + # int to bf16 should be exactly representable for small 
integers + self.assertEqual(int(outi[0]), 3) + self.assertEqual(int(outi[1]), 3) + self.assertEqual(int(outi[2]), 3) + self.assertEqual(int(outi[3]), 3) + + outi = cuda.device_array((4,), dtype="int32") + kernel_bf16_to_int[1, 1](outi) + # 3.14 -> 3 for rz/rd, 3 or 4 for rn/ru depending on rounding + self.assertIn(int(outi[0]), (3, 4)) + self.assertEqual(int(outi[1]), 3) + self.assertEqual(int(outi[2]), 3) + self.assertIn(int(outi[3]), (3, 4)) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", From 9317e7a03f37e09af61f88ce16e3918d80e310b2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:43:41 -0700 Subject: [PATCH 30/56] add numerical precision cast and tests --- numba_cuda/numba/cuda/bf16.py | 208 ++++++++++++++++-- .../numba/cuda/tests/cudapy/test_bfloat16.py | 160 +++++++++++--- 2 files changed, 327 insertions(+), 41 deletions(-) diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py index ad893961d..96c1c6f78 100644 --- a/numba_cuda/numba/cuda/bf16.py +++ b/numba_cuda/numba/cuda/bf16.py @@ -39,11 +39,18 @@ __hleu as hleu, __hltu as hltu, # Precision conversion and data movement + # - floating-point family __bfloat162float as bfloat162float, + __float2bfloat16 as float2bfloat16, + __double2bfloat16 as double2bfloat16, __float2bfloat16_rn as float2bfloat16_rn, __float2bfloat16_rz as float2bfloat16_rz, __float2bfloat16_rd as float2bfloat16_rd, __float2bfloat16_ru as float2bfloat16_ru, + # - char family + __bfloat162char_rz as bfloat162char_rz, + __bfloat162uchar_rz as bfloat162uchar_rz, + # - int family (signed 32-bit) __int2bfloat16_rn as int2bfloat16_rn, __int2bfloat16_rz as int2bfloat16_rz, __int2bfloat16_rd as int2bfloat16_rd, @@ -52,6 +59,56 @@ __bfloat162int_rz as bfloat162int_rz, __bfloat162int_rd as bfloat162int_rd, __bfloat162int_ru as bfloat162int_ru, + # - short family (signed 16-bit) + __short2bfloat16_rn as short2bfloat16_rn, + __short2bfloat16_rz as 
short2bfloat16_rz, + __short2bfloat16_rd as short2bfloat16_rd, + __short2bfloat16_ru as short2bfloat16_ru, + __bfloat162short_rn as bfloat162short_rn, + __bfloat162short_rz as bfloat162short_rz, + __bfloat162short_rd as bfloat162short_rd, + __bfloat162short_ru as bfloat162short_ru, + # - ushort family (unsigned 16-bit) + __ushort2bfloat16_rn as ushort2bfloat16_rn, + __ushort2bfloat16_rz as ushort2bfloat16_rz, + __ushort2bfloat16_rd as ushort2bfloat16_rd, + __ushort2bfloat16_ru as ushort2bfloat16_ru, + __bfloat162ushort_rn as bfloat162ushort_rn, + __bfloat162ushort_rz as bfloat162ushort_rz, + __bfloat162ushort_rd as bfloat162ushort_rd, + __bfloat162ushort_ru as bfloat162ushort_ru, + # - uint family (unsigned 32-bit) + __uint2bfloat16_rn as uint2bfloat16_rn, + __uint2bfloat16_rz as uint2bfloat16_rz, + __uint2bfloat16_rd as uint2bfloat16_rd, + __uint2bfloat16_ru as uint2bfloat16_ru, + __bfloat162uint_rn as bfloat162uint_rn, + __bfloat162uint_rz as bfloat162uint_rz, + __bfloat162uint_rd as bfloat162uint_rd, + __bfloat162uint_ru as bfloat162uint_ru, + # - ll family (signed 64-bit) + __ll2bfloat16_rn as ll2bfloat16_rn, + __ll2bfloat16_rz as ll2bfloat16_rz, + __ll2bfloat16_rd as ll2bfloat16_rd, + __ll2bfloat16_ru as ll2bfloat16_ru, + __bfloat162ll_rn as bfloat162ll_rn, + __bfloat162ll_rz as bfloat162ll_rz, + __bfloat162ll_rd as bfloat162ll_rd, + __bfloat162ll_ru as bfloat162ll_ru, + # - ull family (unsigned 64-bit) + __ull2bfloat16_rn as ull2bfloat16_rn, + __ull2bfloat16_rz as ull2bfloat16_rz, + __ull2bfloat16_rd as ull2bfloat16_rd, + __ull2bfloat16_ru as ull2bfloat16_ru, + __bfloat162ull_rn as bfloat162ull_rn, + __bfloat162ull_rz as bfloat162ull_rz, + __bfloat162ull_rd as bfloat162ull_rd, + __bfloat162ull_ru as bfloat162ull_ru, + # - bit reinterpret casts + __bfloat16_as_short as bfloat16_as_short, + __bfloat16_as_ushort as bfloat16_as_ushort, + __short_as_bfloat16 as short_as_bfloat16, + __ushort_as_bfloat16 as ushort_as_bfloat16, htrunc, hceil, hfloor, @@ -140,6 
+197,83 @@ def exp2_ol(a): except ImportError: pass +## Public aliases using Numba/Numpy-style type names +# Floating-point +float32_to_bfloat16 = float2bfloat16 +float64_to_bfloat16 = double2bfloat16 +bfloat16_to_float32 = bfloat162float +float32_to_bfloat16_rn = float2bfloat16_rn +float32_to_bfloat16_rz = float2bfloat16_rz +float32_to_bfloat16_rd = float2bfloat16_rd +float32_to_bfloat16_ru = float2bfloat16_ru + +# Char (8-bit) +bfloat16_to_int8_rz = bfloat162char_rz +bfloat16_to_uint8_rz = bfloat162uchar_rz + +# Int16 / UInt16 +int16_to_bfloat16_rn = short2bfloat16_rn +int16_to_bfloat16_rz = short2bfloat16_rz +int16_to_bfloat16_rd = short2bfloat16_rd +int16_to_bfloat16_ru = short2bfloat16_ru +bfloat16_to_int16_rn = bfloat162short_rn +bfloat16_to_int16_rz = bfloat162short_rz +bfloat16_to_int16_rd = bfloat162short_rd +bfloat16_to_int16_ru = bfloat162short_ru + +uint16_to_bfloat16_rn = ushort2bfloat16_rn +uint16_to_bfloat16_rz = ushort2bfloat16_rz +uint16_to_bfloat16_rd = ushort2bfloat16_rd +uint16_to_bfloat16_ru = ushort2bfloat16_ru +bfloat16_to_uint16_rn = bfloat162ushort_rn +bfloat16_to_uint16_rz = bfloat162ushort_rz +bfloat16_to_uint16_rd = bfloat162ushort_rd +bfloat16_to_uint16_ru = bfloat162ushort_ru + +# Int32 / UInt32 +int32_to_bfloat16_rn = int2bfloat16_rn +int32_to_bfloat16_rz = int2bfloat16_rz +int32_to_bfloat16_rd = int2bfloat16_rd +int32_to_bfloat16_ru = int2bfloat16_ru +bfloat16_to_int32_rn = bfloat162int_rn +bfloat16_to_int32_rz = bfloat162int_rz +bfloat16_to_int32_rd = bfloat162int_rd +bfloat16_to_int32_ru = bfloat162int_ru + +uint32_to_bfloat16_rn = uint2bfloat16_rn +uint32_to_bfloat16_rz = uint2bfloat16_rz +uint32_to_bfloat16_rd = uint2bfloat16_rd +uint32_to_bfloat16_ru = uint2bfloat16_ru +bfloat16_to_uint32_rn = bfloat162uint_rn +bfloat16_to_uint32_rz = bfloat162uint_rz +bfloat16_to_uint32_rd = bfloat162uint_rd +bfloat16_to_uint32_ru = bfloat162uint_ru + +# Int64 / UInt64 +int64_to_bfloat16_rn = ll2bfloat16_rn +int64_to_bfloat16_rz = 
ll2bfloat16_rz +int64_to_bfloat16_rd = ll2bfloat16_rd +int64_to_bfloat16_ru = ll2bfloat16_ru +bfloat16_to_int64_rn = bfloat162ll_rn +bfloat16_to_int64_rz = bfloat162ll_rz +bfloat16_to_int64_rd = bfloat162ll_rd +bfloat16_to_int64_ru = bfloat162ll_ru + +uint64_to_bfloat16_rn = ull2bfloat16_rn +uint64_to_bfloat16_rz = ull2bfloat16_rz +uint64_to_bfloat16_rd = ull2bfloat16_rd +uint64_to_bfloat16_ru = ull2bfloat16_ru +bfloat16_to_uint64_rn = bfloat162ull_rn +bfloat16_to_uint64_rz = bfloat162ull_rz +bfloat16_to_uint64_rd = bfloat162ull_rd +bfloat16_to_uint64_ru = bfloat162ull_ru + +# Bit reinterpret casts +bfloat16_as_int16 = bfloat16_as_short +bfloat16_as_uint16 = bfloat16_as_ushort +int16_as_bfloat16 = short_as_bfloat16 +uint16_as_bfloat16 = ushort_as_bfloat16 + __all__ = [ "typing_registry", "target_registry", @@ -180,19 +314,67 @@ def exp2_ol(a): "hleu", "hltu", # Precision conversion and data movement - "bfloat162float", - "float2bfloat16_rn", - "float2bfloat16_rz", - "float2bfloat16_rd", - "float2bfloat16_ru", - "int2bfloat16_rn", - "int2bfloat16_rz", - "int2bfloat16_rd", - "int2bfloat16_ru", - "bfloat162int_rn", - "bfloat162int_rz", - "bfloat162int_rd", - "bfloat162int_ru", + "float32_to_bfloat16", + "float64_to_bfloat16", + "bfloat16_to_float32", + "float32_to_bfloat16_rn", + "float32_to_bfloat16_rz", + "float32_to_bfloat16_rd", + "float32_to_bfloat16_ru", + "bfloat16_to_int8_rz", + "bfloat16_to_uint8_rz", + "int16_to_bfloat16_rn", + "int16_to_bfloat16_rz", + "int16_to_bfloat16_rd", + "int16_to_bfloat16_ru", + "bfloat16_to_int16_rn", + "bfloat16_to_int16_rz", + "bfloat16_to_int16_rd", + "bfloat16_to_int16_ru", + "uint16_to_bfloat16_rn", + "uint16_to_bfloat16_rz", + "uint16_to_bfloat16_rd", + "uint16_to_bfloat16_ru", + "bfloat16_to_uint16_rn", + "bfloat16_to_uint16_rz", + "bfloat16_to_uint16_rd", + "bfloat16_to_uint16_ru", + "int32_to_bfloat16_rn", + "int32_to_bfloat16_rz", + "int32_to_bfloat16_rd", + "int32_to_bfloat16_ru", + "bfloat16_to_int32_rn", + 
"bfloat16_to_int32_rz", + "bfloat16_to_int32_rd", + "bfloat16_to_int32_ru", + "uint32_to_bfloat16_rn", + "uint32_to_bfloat16_rz", + "uint32_to_bfloat16_rd", + "uint32_to_bfloat16_ru", + "bfloat16_to_uint32_rn", + "bfloat16_to_uint32_rz", + "bfloat16_to_uint32_rd", + "bfloat16_to_uint32_ru", + "int64_to_bfloat16_rn", + "int64_to_bfloat16_rz", + "int64_to_bfloat16_rd", + "int64_to_bfloat16_ru", + "bfloat16_to_int64_rn", + "bfloat16_to_int64_rz", + "bfloat16_to_int64_rd", + "bfloat16_to_int64_ru", + "uint64_to_bfloat16_rn", + "uint64_to_bfloat16_rz", + "uint64_to_bfloat16_rd", + "uint64_to_bfloat16_ru", + "bfloat16_to_uint64_rn", + "bfloat16_to_uint64_rz", + "bfloat16_to_uint64_rd", + "bfloat16_to_uint64_ru", + "bfloat16_as_int16", + "bfloat16_as_uint16", + "int16_as_bfloat16", + "uint16_as_bfloat16", "htrunc", "hceil", "hfloor", diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 53309f671..1147bba11 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -32,20 +32,38 @@ hmin_nan, hisnan, hisinf, - # Conversion intrinsics - bfloat162float, - float2bfloat16_rn, - float2bfloat16_rz, - float2bfloat16_rd, - float2bfloat16_ru, - int2bfloat16_rn, - int2bfloat16_rz, - int2bfloat16_rd, - int2bfloat16_ru, - bfloat162int_rn, - bfloat162int_rz, - bfloat162int_rd, - bfloat162int_ru, + # Conversion intrinsics (NumPy-style names) + bfloat16_to_float32, + float32_to_bfloat16, + float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, + int32_to_bfloat16_rn, + int32_to_bfloat16_rz, + int32_to_bfloat16_rd, + int32_to_bfloat16_ru, + bfloat16_to_int32_rn, + bfloat16_to_int32_rz, + bfloat16_to_int32_rd, + bfloat16_to_int32_ru, + bfloat16_to_int16_rn, + int16_to_bfloat16_rn, + bfloat16_to_uint16_rn, + uint16_to_bfloat16_rn, + bfloat16_to_uint32_rn, + uint32_to_bfloat16_rn, + 
bfloat16_to_int64_rn, + int64_to_bfloat16_rn, + bfloat16_to_uint64_rn, + uint64_to_bfloat16_rn, + bfloat16_as_short, + bfloat16_as_ushort, + short_as_bfloat16, + ushort_as_bfloat16, + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, ) from numba.cuda.testing import CUDATestCase @@ -284,37 +302,37 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_precision_conversion_intrinsics(self): + def test_int32_float32_precision_conversion_intrinsics(self): self.skip_unsupported() @cuda.jit def kernel_float_to_bf16(out): f = float32(3.14) - out[0] = float32(float2bfloat16_rn(f)) - out[1] = float32(float2bfloat16_rz(f)) - out[2] = float32(float2bfloat16_rd(f)) - out[3] = float32(float2bfloat16_ru(f)) + out[0] = float32(float32_to_bfloat16_rn(f)) + out[1] = float32(float32_to_bfloat16_rz(f)) + out[2] = float32(float32_to_bfloat16_rd(f)) + out[3] = float32(float32_to_bfloat16_ru(f)) @cuda.jit def kernel_bf16_to_float(out): a = bfloat16(3.14) - out[0] = bfloat162float(a) + out[0] = bfloat16_to_float32(a) @cuda.jit def kernel_int_to_bf16(out): i = 3 - out[0] = float32(int2bfloat16_rn(i)) - out[1] = float32(int2bfloat16_rz(i)) - out[2] = float32(int2bfloat16_rd(i)) - out[3] = float32(int2bfloat16_ru(i)) + out[0] = float32(int32_to_bfloat16_rn(i)) + out[1] = float32(int32_to_bfloat16_rz(i)) + out[2] = float32(int32_to_bfloat16_rd(i)) + out[3] = float32(int32_to_bfloat16_ru(i)) @cuda.jit def kernel_bf16_to_int(out): a = bfloat16(3.14) - out[0] = bfloat162int_rn(a) - out[1] = bfloat162int_rz(a) - out[2] = bfloat162int_rd(a) - out[3] = bfloat162int_ru(a) + out[0] = bfloat16_to_int32_rn(a) + out[1] = bfloat16_to_int32_rz(a) + out[2] = bfloat16_to_int32_rd(a) + out[3] = bfloat16_to_int32_ru(a) out = cuda.device_array((4,), dtype="float32") kernel_float_to_bf16[1, 1](out) @@ -345,6 +363,92 @@ def kernel_bf16_to_int(out): self.assertEqual(int(outi[2]), 3) self.assertIn(int(outi[3]), (3, 4)) + def 
test_floatroundtrip_integer_conversion_intrinsics(self): + self.skip_unsupported() + + @cuda.jit + def kernel_scalar_roundtrip(out): + f = 3.14 + bf = float32_to_bfloat16(f) + out[0] = bfloat16_to_float32(bf) + d = 3.14 + bf2 = float64_to_bfloat16(d) + out[1] = bfloat16_to_float32(bf2) + + out = cuda.device_array((2,), dtype="float32") + kernel_scalar_roundtrip[1, 1](out) + self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) + self.assertAlmostEqual(out[1], 3.140625, delta=1e-3) + + @cuda.jit + def kernel_int_family(outf): + outf[0] = float32(int16_to_bfloat16_rn(123)) + outf[1] = float32(uint16_to_bfloat16_rn(456)) + outf[2] = float32(uint32_to_bfloat16_rn(789)) + outf[3] = float32(int64_to_bfloat16_rn(1011)) + outf[4] = float32(uint64_to_bfloat16_rn(1213)) + + outf = cuda.device_array((5,), dtype="float32") + kernel_int_family[1, 1](outf) + vals = [123, 456, 789, 1011, 1213] + for i, v in enumerate(vals): + got = int(outf[i]) + # `step` estimates ULP near the integer `v`. + # Bfloat16 has 7 bits of precision, spacing between representable values is 2**(e-7). + # We use the exponent of the value `v` to raise the minSpacing, the result is a reasonable + # estimate of the local ULP. + step = ( + 0 if v == 0 else 2 ** (int(math.floor(math.log2(abs(v)))) - 7) + ) + # `allowed` is the maximum error in ULP, with a minimum of 1 + # In general, half ULP is the typical rounding error bound.
+ allowed = max(1, int(step // 2)) + self.assertLessEqual(abs(got - v), allowed) + + @cuda.jit + def kernel_from_bf16_to_ints(outi): + a = bfloat16(5.75) + outi[0] = bfloat16_to_int16_rn(a) + outi[1] = bfloat16_to_uint16_rn(a) + outi[2] = bfloat16_to_uint32_rn(a) + outi[3] = bfloat16_to_int64_rn(a) + outi[4] = bfloat16_to_uint64_rn(a) + + outi = cuda.device_array((5,), dtype="int64") + kernel_from_bf16_to_ints[1, 1](outi) + self.assertEqual(int(outi[0]), 6) + self.assertEqual(int(outi[1]), 6) + self.assertEqual(int(outi[2]), 6) + self.assertEqual(int(outi[3]), 6) + self.assertEqual(int(outi[4]), 6) + + @cuda.jit + def kernel_bit_reinterpret(out_short, out_ushort): + s = 12345 + bf = short_as_bfloat16(s) + out_short[0] = bfloat16_as_short(bf) + us = 54321 + bf2 = ushort_as_bfloat16(us) + out_ushort[0] = bfloat16_as_ushort(bf2) + + out_short = cuda.device_array((1,), dtype="int32") + out_ushort = cuda.device_array((1,), dtype="uint32") + kernel_bit_reinterpret[1, 1](out_short, out_ushort) + self.assertEqual(int(out_short[0]), 12345) + self.assertEqual(int(out_ushort[0]), 54321) + + @cuda.jit + def kernel_char(out_c, out_uc): + a = bfloat16(3.9) + out_c[0] = bfloat16_to_int8_rz(a) + out_uc[0] = bfloat16_to_uint8_rz(a) + + out_c = cuda.device_array((1,), dtype="int8") + out_uc = cuda.device_array((1,), dtype="uint8") + kernel_char[1, 1](out_c, out_uc) + self.assertEqual(int(out_c[0]), 3) + self.assertEqual(int(out_uc[0]), 3) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", From 55d222024a929bf4c4b942dec87a877224a453df Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:51:39 -0700 Subject: [PATCH 31/56] add documentation for conversions --- docs/source/reference/types.rst | 173 ++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index b11d68186..40112210a 100644 --- a/docs/source/reference/types.rst +++ 
b/docs/source/reference/types.rst @@ -364,3 +364,176 @@ Special value predicates: the ordered comparisons above. For more details on the CUDA bfloat16 comparison semantics, see `NVIDIA CUDA Math API: Bfloat16 Comparison Functions `_. + +Precision Conversion and Data Movement +************************************** + +The following conversion intrinsics convert between ``bfloat16`` and other +scalar types. Rounding-mode suffixes: + +- ``_rn``: round-to-nearest-even +- ``_rz``: round-towards-zero +- ``_rd``: round-down (towards −∞) +- ``_ru``: round-up (towards +∞) + +Floating-point conversions +========================== + +.. function:: numba.cuda.bf16.float32_to_bfloat16(x) + + Convert a ``float32`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.float64_to_bfloat16(x) + + Convert a ``float64`` to ``bfloat16`` (default rounding is round-to-nearest-even). + +.. function:: numba.cuda.bf16.bfloat16_to_float32(x) + + Convert a ``bfloat16`` to ``float32``. + +.. function:: numba.cuda.bf16.float32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.float32_to_bfloat16_ru(x) + + Convert a ``float32`` to ``bfloat16`` using the specified rounding mode. + +Integer conversions +=================== + +Representative APIs for each integer width are listed below. All have +rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``. + +int16 (signed 16-bit) +--------------------- + +.. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int16_to_bfloat16_ru(x) + + Convert an ``int16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_rz(x) +..
function:: numba.cuda.bf16.bfloat16_to_int16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int16_ru(x) + + Convert a ``bfloat16`` to ``int16`` with the selected rounding mode. + +uint16 (unsigned 16-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint16_to_bfloat16_ru(x) + + Convert a ``uint16`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint16_ru(x) + + Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode. + +int32 (signed 32-bit) +--------------------- + +.. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int32_to_bfloat16_ru(x) + + Convert an ``int32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int32_ru(x) + + Convert a ``bfloat16`` to ``int32`` with the selected rounding mode. + +uint32 (unsigned 32-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint32_to_bfloat16_ru(x) + + Convert a ``uint32`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint32_rd(x) +.. 
function:: numba.cuda.bf16.bfloat16_to_uint32_ru(x) + + Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode. + +int64 (signed 64-bit) +--------------------- + +.. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.int64_to_bfloat16_ru(x) + + Convert an ``int64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_int64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_int64_ru(x) + + Convert a ``bfloat16`` to ``int64`` with the selected rounding mode. + +uint64 (unsigned 64-bit) +------------------------ + +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_rd(x) +.. function:: numba.cuda.bf16.uint64_to_bfloat16_ru(x) + + Convert a ``uint64`` to ``bfloat16`` with the selected rounding mode. + +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rn(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rz(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_rd(x) +.. function:: numba.cuda.bf16.bfloat16_to_uint64_ru(x) + + Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode. + +8-bit conversions +================= + +.. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x) + + Convert a ``bfloat16`` to ``int8`` with round-towards-zero. + +.. function:: numba.cuda.bf16.bfloat16_to_uint8_rz(x) + + Convert a ``bfloat16`` to ``uint8`` with round-towards-zero. + +Bit Reinterpret Casts +********************* + +These APIs reinterpret bits without numeric conversion: + +.. function:: numba.cuda.bf16.bfloat16_as_int16(x) + + Reinterpret the bits of ``bfloat16`` as an ``int16``. + +.. 
function:: numba.cuda.bf16.bfloat16_as_uint16(x) + + Reinterpret the bits of ``bfloat16`` as a ``uint16``. + +.. function:: numba.cuda.bf16.int16_as_bfloat16(x) + + Reinterpret the bits of an ``int16`` as a ``bfloat16``. + +.. function:: numba.cuda.bf16.uint16_as_bfloat16(x) + + Reinterpret the bits of a ``uint16`` as a ``bfloat16``. From 702b8cab9eb2bac265a00d384ac750ca1676c315 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 15:54:57 -0700 Subject: [PATCH 32/56] removing cuda_bf16 vended headers --- numba_cuda/numba/cuda/include/11/cuda_bf16.h | 3749 ----------------- .../numba/cuda/include/11/cuda_bf16.hpp | 2683 ------------ 2 files changed, 6432 deletions(-) delete mode 100644 numba_cuda/numba/cuda/include/11/cuda_bf16.h delete mode 100644 numba_cuda/numba/cuda/include/11/cuda_bf16.hpp diff --git a/numba_cuda/numba/cuda/include/11/cuda_bf16.h b/numba_cuda/numba/cuda/include/11/cuda_bf16.h deleted file mode 100644 index 78f660d38..000000000 --- a/numba_cuda/numba/cuda/include/11/cuda_bf16.h +++ /dev/null @@ -1,3749 +0,0 @@ -/* -* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO LICENSEE: -* -* This source code and/or documentation ("Licensed Deliverables") are -* subject to NVIDIA intellectual property rights under U.S. and -* international Copyright laws. -* -* These Licensed Deliverables contained herein is PROPRIETARY and -* CONFIDENTIAL to NVIDIA and is being provided under the terms and -* conditions of a form of NVIDIA software license agreement by and -* between NVIDIA and Licensee ("License Agreement") or electronically -* accepted by Licensee. Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. 
-* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. -* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -/** -* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics -* This section describes nv_bfloat16 precision intrinsic functions that are -* only supported in device code. -* To use these functions, include the header file \p cuda_bf16.h in your program. 
-*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. -*/ - -/** -* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions -* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 -* To use these functions, include the header file \p cuda_bf16.h in your program. 
-*/ - -#ifndef __CUDA_BF16_H__ -#define __CUDA_BF16_H__ - -#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x -#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) - -#if defined(__cplusplus) -#if defined(__CUDACC__) -#define __CUDA_BF16_DECL__ static __device__ __inline__ -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ -#else -#define __CUDA_HOSTDEVICE_BF16_DECL__ static -#endif /* defined(__CUDACC__) */ - -#define __CUDA_BF16_TYPES_EXIST__ - -/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ - -/** - * \brief nv_bfloat16 datatype - * - * \details This structure implements the datatype for storing - * nv_bfloat16 floating-point numbers. The structure implements - * assignment operators and type conversions. 16 bits are being - * used in total: 1 sign bit, 8 bits for the exponent, and - * the significand is being stored in 7 bits. The total - * precision is 8 bits. - * - */ -struct __nv_bfloat16; - -/** - * \brief nv_bfloat162 datatype - * - * \details This structure implements the datatype for storing two - * nv_bfloat16 floating-point numbers. - * The structure implements assignment operators and type conversions. - * - */ -struct __nv_bfloat162; - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - double. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. 
-* -* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. -* \param[in] a - float. Is only being read. -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-down mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts float number to nv_bfloat16 precision in round-up mode -* and returns \p nv_bfloat16 with converted value. -* -* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat16 -* - \p a converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts \p nv_bfloat16 number to float. -* -* \details Converts nv_bfloat16 number \p a to float. -* \param[in] a - float. Is only being read. -* -* \returns float -* - \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and -* populates both halves of \p nv_bfloat162 with converted value. -* -* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and -* populates both halves of \p nv_bfloat162 with converted value. -* \param[in] a - float. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 -* precision number. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even -* mode and returns \p nv_bfloat162 with converted values. -* -* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode -* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return -* value correspond to the input \p a, high 16 bits correspond to the input \p -* b. -* \param[in] a - float. Is only being read. -* \param[in] b - float. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 value with corresponding halves equal to the -* converted input floats. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result -* -* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number -* and returns the result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns float -* - The low 16 bits of \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result -* -* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number -* and returns the result. -* \param[in] a - nv_bfloat162. Is only being read. 
-* -* \returns float -* - The high 16 bits of \p a converted to float. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); - -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both components of float2 number to nv_bfloat16 precision in -* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. -* -* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest -* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the -* return value correspond to \p a.x and high 16 bits of the return value -* correspond to \p a.y. -* \param[in] a - float2. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 which has corresponding halves equal to the -* converted float2 components. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. -* -* \details Converts both halves of \p nv_bfloat162 input \p a to float2 and returns the -* result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns float2 -* - \p a converted to float2. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-to-nearest-even mode. NaN inputs are converted to 0. 
-* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in -* round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns int -* - \p h converted to a signed integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. 
-* \param[in] i - int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed short -* integer in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - \p h converted to a signed short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. 
-* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer -* in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned int -* - \p h converted to an unsigned integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even -* mode. 
-* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-to-nearest-even mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-towards-zero mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-down mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short -* integer in round-up mode. NaN inputs are converted to 0. -* \param[in] h - nv_bfloat16. Is only being read. 
-* -* \returns unsigned short int -* - \p h converted to an unsigned short integer. -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero -* mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-down mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit -* integer in round-up mode. NaN inputs return 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned long long int -* - \p h converted to an unsigned 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero -* mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - unsigned long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even -* mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. -* -* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit -* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns long long int -* - \p h converted to a signed 64-bit integer. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even -* mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-to-nearest-even mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-towards-zero mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. 
-* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-down mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. -* -* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point -* value in round-up mode. -* \param[in] i - long long int. Is only being read. -* -* \returns nv_bfloat16 -* - \p i converted to nv_bfloat16. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); - -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Truncate input argument to the integral part. -* -* \details Round \p h to the nearest integer value that does not exceed \p h in -* magnitude. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The truncated integer value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculate ceiling of the input argument. -* -* \details Compute the smallest integer value not less than \p h. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The smallest integer value not less than \p h. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* \details Calculate the largest integer value which is less than or equal to \p h. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The largest integer value which is less than or equal to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Round input to nearest integer value in nv_bfloat16 floating-point -* number. -* -* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point -* format, with bfloat16way cases rounded to the nearest even integer value. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The nearest integer to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); - -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. -* -* \details Round each component of vector \p h to the nearest integer value that does -* not exceed \p h in magnitude. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The truncated \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. 
-* -* \details For each component of vector \p h compute the smallest integer value not less -* than \p h. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of smallest integers not less than \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* \details For each component of vector \p h calculate the largest integer value which -* is less than or equal to \p h. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of largest integers which is less than or equal to \p h. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Round input to nearest integer value in nv_bfloat16 floating-point -* number. -* -* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in -* nv_bfloat16 floating-point format, with bfloat16way cases rounded to the -* nearest even integer value. -* \param[in] h - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector of rounded integer values. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns \p nv_bfloat162 with both halves equal to the input value. -* -* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 -* number. -* \param[in] a - nv_bfloat16. Is only being read. 
-* -* \returns nv_bfloat162 -* - The vector which has both its halves equal to the input \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Swaps both halves of the \p nv_bfloat162 input. -* -* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number -* with swapped halves. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - \p a with its halves being swapped. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines -* into one \p nv_bfloat162 number. -* -* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into -* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of -* the return value, low 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The low 16 bits of \p a and of \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and -* combines into one \p nv_bfloat162 number. -* -* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into -* one \p nv_bfloat162 number. 
High 16 bits from input \p a is stored in low 16 bits of -* the return value, high 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The high 16 bits of \p a and of \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns high 16 bits of \p nv_bfloat162 input. -* -* \details Returns high 16 bits of \p nv_bfloat162 input \p a. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat16 -* - The high 16 bits of the input. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Returns low 16 bits of \p nv_bfloat162 input. -* -* \details Returns low 16 bits of \p nv_bfloat162 input \p a. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat16 -* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Checks if the input \p nv_bfloat16 number is infinite. -* -* \details Checks if the input \p nv_bfloat16 number \p a is infinite. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns int -* - -1 iff \p a is equal to negative infinity, -* - 1 iff \p a is equal to positive infinity, -* - 0 otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. -* -* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. -* Input \p a is stored in low 16 bits of the return value, input \p b is stored -* in high 16 bits of the return value. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts low 16 bits from \p nv_bfloat162 input. -* -* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 -* number which has both halves equal to the extracted bits. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Extracts high 16 bits from \p nv_bfloat162 input. -* -* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 -* number which has both halves equal to the extracted bits. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. -* -* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h -* as a signed short integer. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns short int -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. -* -* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h -* as an unsigned short number. -* \param[in] h - nv_bfloat16. Is only being read. -* -* \returns unsigned short int -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. -* -* \details Reinterprets the bits in the signed short integer \p i as a -* nv_bfloat16 floating-point number. -* \param[in] i - short int. Is only being read. -* -* \returns nv_bfloat16 -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. 
-* -* \details Reinterprets the bits in the unsigned short integer \p i as a -* nv_bfloat16 floating-point number. -* \param[in] i - unsigned short int. Is only being read. -* -* \returns nv_bfloat16 -* - The reinterpreted value. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); - -#if !defined warpSize && !defined __local_warpSize -#define warpSize 32 -#define __local_warpSize -#endif - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
-* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
-* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. 
Is only being read. -* \param[in] var - nv_bfloat162. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
-* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
-* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. 
-* \param[in] var - nv_bfloat16. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. -* \note_ref_guide_warp_shuffle -* \internal -* \exception-guarantee no-throw guarantee -* \behavior not reentrant, not thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); - -#if defined(__local_warpSize) -#undef warpSize -#undef __local_warpSize -#endif - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.nc` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.nc` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cg` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cg` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.ca` load instruction. 
-* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.ca` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cs` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cs` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.lu` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.lu` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cv` load instruction. -* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `ld.global.cv` load instruction. 
-* \param[in] ptr - memory location -* \returns The value pointed by `ptr` -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); - -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wb` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wb` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cg` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cg` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cs` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.cs` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wt` store instruction. 
-* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); -/** -* \ingroup CUDA_MATH__BFLOAT16_MISC -* \brief Generates a `st.global.wt` store instruction. -* \param[out] ptr - memory location -* \param[in] value - the value to be stored -*/ -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); - -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs nv_bfloat162 vector if-equal comparison. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of if-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector not-equal comparison. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of not-equal comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-equal comparison. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-equal comparison. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of greater-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-than comparison. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. 
-* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-than comparison. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of greater-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered if-equal comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered not-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. -* -* Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered less-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
-* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The vector result of unordered less-than comparison of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Determine whether \p nv_bfloat162 argument is a NaN. -* -* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to -* 1.0 for NaN, 0.0 otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-95 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-104 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-102 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplying the vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode. Prevents floating-point contractions of mul+add into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-95 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma. 
-* \internal -* \req DEEPLEARN-SRM_REQ-104 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode. Prevents floating-point contractions of mul+add -* or sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-102 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplying the vectors \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. -* -* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-103 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise division of \p a with \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and -* returns the result. -* -* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and -* returns the result. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns bfloat2 -* - Returns \p a with the absolute value of both halves. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The sum of \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. 
-* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The subtraction of vector \p b from \p a, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise multiplication of vectors \p a and \p b, -* with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-105 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode, with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the -* results to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, -* with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Negates both halves of the input \p nv_bfloat162 number and returns the -* result. -* -* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. -* \internal -* \req DEEPLEARN-SRM_REQ-101 -* \endinternal -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - Returns \p a with both halves negated. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. 
-* -* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The absolute value of a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-94 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-97 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtracting \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode. 
-* \internal -* \req DEEPLEARN-SRM_REQ-99 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even -* mode. Prevents floating-point contractions of mul+add into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-94 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. -* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode. Prevents floating-point contractions of mul+sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-97 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtracting \p b from \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode. 
Prevents floating-point contractions of mul+add or sub into fma. -* \internal -* \req DEEPLEARN-SRM_REQ-99 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b. -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. -* -* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest -* mode. -* \internal -* \req DEEPLEARN-SRM_REQ-98 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of dividing \p a by \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sum of \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. 
-* -* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest -* mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of subtraction of \p b from \p a, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of multiplying \p a and \p b, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* \internal -* \req DEEPLEARN-SRM_REQ-96 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the result -* to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c, with respect to saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Negates input \p nv_bfloat16 number and returns the result. -* -* \details Negates input \p nv_bfloat16 number and returns the result. -* \internal -* \req DEEPLEARN-SRM_REQ-100 -* \endinternal -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - minus a -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true -* iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. 
-* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of if-equal comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of not-equal comparison -* of vectors \p a and \p b are true, -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. 
-* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of less-equal comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of greater-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns bool -* - true if both \p nv_bfloat16 results of less-than comparison -* of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean -* true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of greater-than -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered if-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered not-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered less-equal -* comparison of vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and -* returns boolean true iff both \p nv_bfloat16 results are true, boolean false -* otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered -* greater-equal comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns -* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. -* -* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered less-than comparison of -* vectors \p a and \p b are true; -* - false otherwise. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and -* returns boolean true iff both \p nv_bfloat16 results are true, boolean false -* otherwise. -* -* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns bool -* - true if both \p nv_bfloat16 results of unordered -* greater-than comparison of vectors \p a and \p b are true; -* - false otherwise. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 if-equal comparison. -* -* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of if-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 not-equal comparison. -* -* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. 
Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of not-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 less-equal comparison. -* -* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of less-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 greater-equal comparison. -* -* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of greater-equal comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 less-than comparison. -* -* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of less-than comparison of \p a and \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 greater-than comparison. -* -* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of greater-than comparison of \p a and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered if-equal comparison. -* -* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered if-equal comparison of \p a and -* \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered not-equal comparison. -* -* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered not-equal comparison of \p a and -* \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered less-equal comparison. -* -* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered less-equal comparison of \p a and -* \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. -* -* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered greater-equal comparison of \p a -* and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered less-than comparison. -* -* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered less-than comparison of \p a and -* \p b. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Performs \p nv_bfloat16 unordered greater-than comparison. -* -* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns bool -* - The boolean result of unordered greater-than comparison of \p a -* and \p b. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Determine whether \p nv_bfloat16 argument is a NaN. -* -* \details Determine whether \p nv_bfloat16 value \p a is a NaN. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns bool -* - true iff argument is NaN. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 maximum of two input values. -* -* \details Calculates \p nv_bfloat16 max(\p a, \p b) -* defined as (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. 
-* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 minimum of two input values. -* -* \details Calculates \p nv_bfloat16 min(\p a, \p b) -* defined as (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. -* -* \details Calculates \p nv_bfloat16 max(\p a, \p b) -* defined as (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_COMPARISON -* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. -* -* \details Calculates \p nv_bfloat16 min(\p a, \p b) -* defined as (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. 
-* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. -* -* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat16 add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* Then negative result is clamped to 0. -* NaN result is converted to canonical NaN. -* \param[in] a - nv_bfloat16. Is only being read. -* \param[in] b - nv_bfloat16. Is only being read. -* \param[in] c - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The result of fused multiply-add operation on \p -* a, \p b, and \p c with relu saturation. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. -* -* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns nv_bfloat162 -* - The result of elementwise maximum of vectors \p a and \p b -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. -* -* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, the other input is returned. -* - If both inputs are NaNs, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise minimum of vectors \p a and \p b -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. -* -* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a > \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. 
-* -* \returns nv_bfloat162 -* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_COMPARISON -* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. -* -* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). -* Elementwise \p nv_bfloat16 operation is defined as -* (\p a < \p b) ? \p a : \p b. -* - If either of inputs is NaN, then canonical NaN is returned. -* - If values of both inputs are 0.0, then +0.0 > -0.0 -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even -* mode with relu saturation. -* -* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, -* then performs a \p nv_bfloat162 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* Then negative result is clamped to 0. -* NaN result is converted to canonical NaN. -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Performs fast complex multiply-accumulate -* -* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as -* complex numbers in \p nv_bfloat16 precision and performs -* complex multiply-accumulate operation: a*b + c -* \param[in] a - nv_bfloat162. Is only being read. -* \param[in] b - nv_bfloat162. Is only being read. -* \param[in] c - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); - -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The square root of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The reciprocal square root of \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The reciprocal of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The natural logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The binary logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even -* mode. 
-* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The decimal logarithm of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The natural exponential function on \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The binary exponential function on \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The decimal exponential function on \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The cosine of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS -* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat16. Is only being read. -* -* \returns nv_bfloat16 -* - The sine of \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise square root on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. 
Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise reciprocal square root on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise reciprocal on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise natural logarithm on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise binary logarithm on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even -* mode. -* -* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise decimal logarithm on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest -* mode. -* -* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise exponential function on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector binary exponential function in -* round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise binary exponential function on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector decimal exponential function in -* round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in -* round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise decimal exponential function on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even -* mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise cosine on vector \p a. -* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); -/** -* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS -* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. -* -* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. -* \param[in] a - nv_bfloat162. Is only being read. -* -* \returns nv_bfloat162 -* - The elementwise sine on vector \p a. 
-* \internal -* \exception-guarantee no-throw guarantee -* \behavior reentrant, thread safe -* \endinternal -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); - -/** -* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC -* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this -* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the -* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. -* -* \details The location of \p address must be in global or shared memory. This operation has undefined -* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. -* -* \param[in] address - __nv_bfloat162*. An address in global or shared memory. -* \param[in] val - __nv_bfloat162. The value to be added. -* -* \returns __nv_bfloat162 -* - The old value read from \p address. -* -* \note_ref_guide_atomic -*/ -__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); - -/** -* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC -* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value -* back to \p address. This operation is performed in one atomic operation. -* -* \details The location of \p address must be in global or shared memory. This operation has undefined -* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. -* -* \param[in] address - __nv_bfloat16*. An address in global or shared memory. -* \param[in] val - __nv_bfloat16. The value to be added. -* -* \returns __nv_bfloat16 -* - The old value read from \p address. 
-* -* \note_ref_guide_atomic -*/ -__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); - -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ - -#undef __CUDA_BF16_DECL__ -#undef __CUDA_HOSTDEVICE_BF16_DECL__ - -#endif /* defined(__cplusplus) */ - -/* Note the .hpp file is included even for host-side compilation, to capture the "nv_bfloat16" & "nv_bfloat162" definitions */ -#include "cuda_bf16.hpp" -#undef ___CUDA_BF16_STRINGIFY_INNERMOST -#undef __CUDA_BF16_STRINGIFY - -#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp b/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp deleted file mode 100644 index 30085da5e..000000000 --- a/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +++ /dev/null @@ -1,2683 +0,0 @@ -/* -* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO LICENSEE: -* -* This source code and/or documentation ("Licensed Deliverables") are -* subject to NVIDIA intellectual property rights under U.S. and -* international Copyright laws. -* -* These Licensed Deliverables contained herein is PROPRIETARY and -* CONFIDENTIAL to NVIDIA and is being provided under the terms and -* conditions of a form of NVIDIA software license agreement by and -* between NVIDIA and Licensee ("License Agreement") or electronically -* accepted by Licensee. Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. -* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
-* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -#if !defined(__CUDA_BF16_HPP__) -#define __CUDA_BF16_HPP__ - -#if !defined(__CUDA_BF16_H__) -#error "Do not include this file directly. Instead, include cuda_bf16.h." -#endif - -#if !defined(_MSC_VER) && __cplusplus >= 201103L -# define __CPP_VERSION_AT_LEAST_11_BF16 -#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L -# define __CPP_VERSION_AT_LEAST_11_BF16 -#endif - -/* C++11 header for std::move. 
- * In RTC mode, std::move is provided implicitly; don't include the header - */ -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) -#include -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ - -/* C++ header for std::memcpy (used for type punning in host-side implementations). - * When compiling as a CUDA source file memcpy is provided implicitly. - * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). - */ -#if defined(__cplusplus) && !defined(__CUDACC__) -#include -#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ - - -/* Set up function decorations */ -#if defined(__CUDACC__) -#define __CUDA_BF16_DECL__ static __device__ __inline__ -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#define __CUDA_HOSTDEVICE__ __host__ __device__ -#else /* !defined(__CUDACC__) */ -#if defined(__GNUC__) -#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) -#else -#define __CUDA_HOSTDEVICE_BF16_DECL__ static -#endif /* defined(__GNUC__) */ -#define __CUDA_HOSTDEVICE__ -#endif /* defined(__CUDACC_) */ - -/* Set up structure-alignment attribute */ -#if defined(__CUDACC__) -#define __CUDA_ALIGN__(align) __align__(align) -#else -/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) -#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ -#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ -#if defined(__GNUC__) -#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) -#elif defined(_MSC_VER) -#define __CUDA_ALIGN__(n) __declspec(align(n)) -#else -#define __CUDA_ALIGN__(n) -#endif /* defined(__GNUC__) */ -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ -#endif /* defined(__CUDACC__) */ - -/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ -#define 
__BFLOAT16_TO_US(var) *(reinterpret_cast(&(var))) -#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast(&(var))) -#define __BFLOAT162_TO_UI(var) *(reinterpret_cast(&(var))) -#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast(&(var))) - -/** -* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until -* these become an actual builtin. Note this initialization is as a -* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16. -* Such a representation will be deprecated in a future version of CUDA. -* (Note these are visible to non-nvcc compilers, including C-only compilation) -*/ -typedef struct __CUDA_ALIGN__(2) { - unsigned short x; -} __nv_bfloat16_raw; - -typedef struct __CUDA_ALIGN__(4) { - unsigned short x; - unsigned short y; -} __nv_bfloat162_raw; - -/* All other definitions in this file are only visible to C++ compilers */ -#if defined(__cplusplus) - -/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -/* class' : multiple assignment operators specified - The class has multiple assignment operators of a single type. 
This warning is informational */ -#if defined(_MSC_VER) && _MSC_VER >= 1500 -#pragma warning( push ) -#pragma warning( disable:4522 ) -#endif /* defined(__GNUC__) */ - -struct __CUDA_ALIGN__(2) __nv_bfloat16 { -protected: - unsigned short __x; - -public: -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) - __nv_bfloat16() = default; -#else - __CUDA_HOSTDEVICE__ __nv_bfloat16() { } -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - - /* Convert to/from __nv_bfloat16_raw */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } - -#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) - /* Construct from float/double */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } - - __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } - - /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } - -/* Member functions only available to nvcc compilation so far */ -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - /* Allow automatic construction from types supported 
natively in hardware */ - /* Note we do avoid constructor init-list because of special host/device compilation rules */ - __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } - __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } - - /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ - __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } - - __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); } - __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return 
*this; } - - /* Boolean conversion - note both 0 and -0 must return false */ - __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; } -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ -#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ -}; - -/* Global-space operator functions are only available to nvcc compilation */ -#if defined(__CUDACC__) - -#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) -#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) -/* Some basic arithmetic operations expected of a builtin */ -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } -__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } - -__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } - -/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */ -__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; } -__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 
0x3F80; h -= one; return h; } -__device__ __forceinline__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat16 ret = h; - __nv_bfloat16_raw one; - one.x = 0x3F80; - h += one; - return ret; -} -__device__ __forceinline__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat16 ret = h; - __nv_bfloat16_raw one; - one.x = 0x3F80; - h -= one; - return ret; -} -/* Unary plus and inverse operators */ -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } - -/* Some basic comparison operations to make it look like a builtin */ -__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } -__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } -__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } -__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } -__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } -__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } -#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ -#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ -#endif /* defined(__CUDACC__) */ - -/* __nv_bfloat162 is visible to non-nvcc host compilers */ -struct __CUDA_ALIGN__(4) __nv_bfloat162 { - 
__nv_bfloat16 x; - __nv_bfloat16 y; - - // All construct/copy/assign/move -public: -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) - __nv_bfloat162() = default; - __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; } -#else - __CUDA_HOSTDEVICE__ __nv_bfloat162() { } -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; } - - /* Convert to/from __nv_bfloat162_raw */ - __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); } - __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; } - __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; } -}; - -/* Global-space operator functions are only available to nvcc compilation */ -#if defined(__CUDACC__) - -#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__) - -__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return 
__hmul2(lh, rh); } -__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } - -__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } -__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } - -__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; } -__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; } -__device__ __forceinline__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. - static_cast(ignored); - - const __nv_bfloat162 ret = h; - __nv_bfloat162_raw one; - one.x = 0x3F80; - one.y = 0x3F80; - h = __hadd2(h, one); - return ret; -} -__device__ __forceinline__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) -{ - // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
- static_cast(ignored); - - const __nv_bfloat162 ret = h; - __nv_bfloat162_raw one; - one.x = 0x3F80; - one.y = 0x3F80; - h = __hsub2(h, one); - return ret; -} -__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } -__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } - -__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } -__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } -__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } -__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } -__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } -__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } - -#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ -#endif /* defined(__CUDACC__) */ - -/* Restore warning for multiple assignment operators */ -#if defined(_MSC_VER) && _MSC_VER >= 1500 -#pragma warning( pop ) -#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ - -/* Restore -Weffc++ warnings from here on */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic pop -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -#undef __CUDA_HOSTDEVICE__ -#undef __CUDA_ALIGN__ - -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) -{ - unsigned int x; - -#if defined(__CUDA_ARCH__) - x = __float_as_uint(f); -#elif defined(__CUDACC__) - (void)memcpy(&x, &f, sizeof(f)); -#else - (void)std::memcpy(&x, &f, 
sizeof(f)); -#endif - - if ((x & 0x7fffffffU) > 0x7f800000U) { - sign = 0U; - remainder = 0U; - return static_cast(0x7fffU); - } - sign = x >> 31U; - remainder = x << 16U; - return static_cast(x >> 16U); -} - -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); - return val; -#else - - float f = static_cast(x); - const double d = static_cast(f); - unsigned int u; - -#if defined(__CUDA_ARCH__) - u = __float_as_uint(f); -#elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); -#else - (void)std::memcpy(&u, &f, sizeof(f)); -#endif - bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); - - - if ((x > 0.0) && (d > x)) { - u--; - } - if ((x < 0.0) && (d < x)) { - u--; - } - if ((d != x) && x_is_not_nan) { - u |= 1U; - } - -#if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); -#elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); -#else - (void)std::memcpy(&f, &u, sizeof(f)); -#endif - - return __float2bfloat16(f); - -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -} - -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { - r.x++; - } - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = 
__internal_float2bfloat16(a, sign, remainder); - if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { - r.x++; - } - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) -{ - __nv_bfloat16 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); -#else - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - val = r; -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); - return val; -#else - __nv_bfloat16 val; - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder != 0U) && (sign != 0U)) { - r.x++; - } - val = r; - return val; -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); - return val; -#else - __nv_bfloat16 val; - __nv_bfloat16_raw r; - unsigned int sign = 0U; - unsigned int remainder = 0U; - r.x = __internal_float2bfloat16(a, sign, remainder); - if ((remainder != 0U) && (sign == 0U)) { - r.x++; - } - val = r; - return val; -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) -{ - __nv_bfloat162 val; -#if __CUDA_ARCH__ >= 800 - asm("{.reg .b16 low;\n" - " cvt.rn.bf16.f32 low, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); -#else - val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 
__floats2bfloat162_rn(const float a, const float b) -{ - __nv_bfloat162 val; -#if __CUDA_ARCH__ >= 800 - asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" - : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); -#else - val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); -#endif - return val; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) -{ - float f; -#if defined(__CUDA_ARCH__) - #if (__CUDA_ARCH__ >= 900) - asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); - #else - asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); - #endif -#else - unsigned int u = static_cast(h) << 16; - #if defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif -#endif - return f; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); -} -__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); -} - -__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) -{ - return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); -} - -#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - -/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ -__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) -{ - __nv_bfloat162 t; t.x = x; t.y = y; return t; -} -#undef __VECTOR_FUNCTIONS_DECL__ - - -/* Definitions of intrinsics */ -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) -{ - __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) -{ - float hi_float; - float lo_float; - lo_float = 
__internal_bfloat162float(((__nv_bfloat162_raw)a).x); - hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); - return make_float2(lo_float, hi_float); -} -__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - const float f = __bfloat162float(h); - int i; - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const int max_val = (int)0x7fffffffU; - const int min_val = (int)0x80000000U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif - return i; -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -} -__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - int val; - asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2int_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) -{ -#if (defined __CUDA_ARCH__) - #if 
(__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; - #else - const float ru = __int2float_ru(i); - const float rd = __int2float_rd(i); - float rz = __int2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); - #endif -#else - const double d = static_cast(i); - return __double2bfloat16(d); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rz(__int2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rd(__int2float_rd(i)); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_ru(__int2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rni.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} - -__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(h))); -#elif (defined __CUDA_ARCH__) - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rzi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - val = static_cast(f); - const short int max_val = (short int)0x7fffU; - const short int min_val = (short int)0x8000U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - val = 0; - } else if (f > static_cast(max_val)) { - // saturate maximum - val = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - val = min_val; - } -#endif - return val; -} -__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rmi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) -{ - short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rpi.s16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - const float f = static_cast(i); - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) -{ -#if defined(__CUDA_ARCH__) && 
(__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rz(__int2float_rz(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rd(__int2float_rd(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_ru(__int2float_ru(static_cast(i))); -#endif -} - -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - - const float f = __bfloat162float(h); - unsigned int i; - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const unsigned int max_val = 0xffffffffU; - const unsigned int min_val = 0U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0U; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif - return i; - -#endif // defined(__CUDA_ARCH__) 
&& (__CUDA_ARCH__ >= 900) -} -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned int val; - asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); - return val; -#else - return __float2uint_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#elif (defined __CUDA_ARCH__) - const float ru = __uint2float_ru(i); - const float rd = __uint2float_rd(i); - float rz = __uint2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); -#else - const double d = static_cast(i); - return __double2bfloat16(d); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rz(__uint2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_rd(__uint2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 
val; - asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); - return val; -#else - return __float2bfloat16_ru(__uint2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rni.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#elif (defined __CUDA_ARCH__) - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rzi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - val = static_cast(f); - const unsigned short int max_val = 0xffffU; - const unsigned short int min_val = 0U; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - val = 0U; - } else if (f > static_cast(max_val)) { - // saturate maximum - val = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - val = min_val; - } -#endif - return val; -} -__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rmi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} 
-__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) -{ - unsigned short int val; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#else - asm("{ .reg.f32 f;\n" - " mov.b32 f, {0,%1};\n" - " cvt.rpi.u16.f32 %0,f;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); -#endif - return val; -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - const float f = static_cast(i); - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rz(__uint2float_rz(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_rd(__uint2float_rd(static_cast(i))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 val; - asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); - return val; -#else - return __float2bfloat16_ru(__uint2float_ru(static_cast(i))); -#endif -} - -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : 
"h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) -{ - unsigned long long int i; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - const float f = __bfloat162float(h); - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const unsigned long long int max_val = 0xffffffffffffffffULL; - const unsigned long long int min_val = 0ULL; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = 0x8000000000000000ULL; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - return i; -} -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - unsigned long long int i; - asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ull_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#elif (defined __CUDA_ARCH__) - const float ru = __ull2float_ru(i); - const float rd = 
__ull2float_rd(i); - float rz = __ull2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - return __float2bfloat16_rn(rz); -#else - float f = static_cast(i); - const unsigned long long int uf = static_cast(f); - unsigned int u; - - #if defined(__CUDA_ARCH__) - u = __float_as_uint(f); - #elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); - #else - (void)std::memcpy(&u, &f, sizeof(f)); - #endif - - // round up happened here - // note: no need to handle round up to f == 0x1.p64 specially - if (uf > i) { - u--; - } - if (uf != i) { - u |= 1U; - } - - #if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); - #elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif - - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rz(__ull2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rd(__ull2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_ru(__ull2float_ru(i)); -#endif -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else 
- return __float2ll_rn(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) -{ - long long int i; -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); -#else - const float f = __bfloat162float(h); - i = static_cast(f); -#if !(defined __CUDA_ARCH__) - const long long int max_val = (long long int)0x7fffffffffffffffULL; - const long long int min_val = (long long int)0x8000000000000000ULL; - const unsigned short bits = static_cast(static_cast<__nv_bfloat16_raw>(h).x << 1U); - // saturation fixup - if (bits > (unsigned short)0xFF00U) { - // NaN - i = min_val; - } else if (f >= static_cast(max_val)) { - // saturate maximum - i = max_val; - } else if (f < static_cast(min_val)) { - // saturate minimum - i = min_val; - } -#endif -#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - return i; -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ll_rd(__bfloat162float(h)); -#endif -} -__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - long long int i; - asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); - return i; -#else - return __float2ll_ru(__bfloat162float(h)); -#endif -} -__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#elif (defined __CUDA_ARCH__) - const float ru = __ll2float_ru(i); - const float rd = __ll2float_rd(i); - float rz = __ll2float_rz(i); - if (ru != rd) { - rz = __uint_as_float(__float_as_uint(rz) | 1U); - } - 
return __float2bfloat16_rn(rz); -#else - float f = static_cast(i); - const long long int lf = static_cast(f); - unsigned int u; - - #if defined(__CUDA_ARCH__) - u = __float_as_uint(f); - #elif defined(__CUDACC__) - (void)memcpy(&u, &f, sizeof(f)); - #else - (void)std::memcpy(&u, &f, sizeof(f)); - #endif - - if ((f > 0.0f) && (lf > i)) { - u--; - } - if ((f < 0.0f) && (lf < i)) { - u--; - } - if (lf != i) { - u |= 1U; - } - - #if defined(__CUDA_ARCH__) - f = __int_as_float(static_cast(u)); - #elif defined(__CUDACC__) - (void)memcpy(&f, &u, sizeof(f)); - #else - (void)std::memcpy(&f, &u, sizeof(f)); - #endif - - return __float2bfloat16_rn(f); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rz(__ll2float_rz(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_rd(__ll2float_rd(i)); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 h; - asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); - return h; -#else - return __float2bfloat16_ru(__ll2float_ru(i)); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rz(truncf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) 
- __nv_bfloat16 r; - asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_ru(ceilf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rd(floorf(__bfloat162float(h))); -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); - return r; -#else - return __float2bfloat16_rn(rintf(__bfloat162float(h))); -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h))); - return __nv_bfloat162(low, high); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h))); - return __nv_bfloat162(low, high); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) -{ - const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h))); - const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h))); - return __nv_bfloat162(low, high); -} - -__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) -{ - return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); -} -__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" 
- " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" - " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) -{ - __nv_bfloat16 ret; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); - return ret; -} -__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) -{ - int retval; - if (__BFLOAT16_TO_CUS(a) == 0xFF80U) { - retval = -1; - } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) { - retval = 1; - } else { - retval = 0; - } - return retval; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) -{ - __nv_bfloat16 ret; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat162 val; - asm("{ mov.b32 %0, {%1,%2};}\n" - : 
"=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) -{ - __nv_bfloat162 val; - asm("{ mov.b32 %0, {%1,%1};}\n" - : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) -{ - __nv_bfloat162 val; - asm("{.reg .b16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) -{ - return static_cast(__BFLOAT16_TO_CUS(h)); -} -__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) -{ - return __BFLOAT16_TO_CUS(h); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) -{ - __nv_bfloat16 h; - __BFLOAT16_TO_US(h) = static_cast(i); - return h; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) -{ - __nv_bfloat16 h; - __BFLOAT16_TO_US(h) = i; - return h; -} - -/****************************************************************************** -* __nv_bfloat16, __nv_bfloat162 warp shuffle * -******************************************************************************/ -#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\ - __nv_bfloat162 r; \ - asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ - :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ - return r; \ -} /* while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32) -} -__CUDA_BF16_DECL__ 
__nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = (warp_size - static_cast(width)) << 8U; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) -{ - unsigned int warp_size; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); - const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; - __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32) -} - -#undef __SHUFFLE_SYNC_BFLOAT162_MACRO - -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} 
-__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) -{ - const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); - const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width); - return __low2bfloat16(temp2); -} - -/****************************************************************************** -* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * -******************************************************************************/ - -#if defined(__cplusplus) -#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __LDG_PTR "l" -#else -#define __LDG_PTR "r" -#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ -__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - 
return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) -{ - __nv_bfloat162 ret; - asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) -{ - __nv_bfloat16 ret; - asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); - return ret; -} - -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), 
"h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) -{ - asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); -} -__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) -{ - asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); -} - -#undef __LDG_PTR -#endif /*defined(__cplusplus) */ -/****************************************************************************** -* __nv_bfloat162 comparison * -******************************************************************************/ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} -#else -#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - " shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ - " shr.u32 low_res, low_res, 16;\n"\ - " or.b32 %0, high_res, low_res;}\n"\ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} -#endif 
- -__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.eq) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ne) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.le) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ge) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.lt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.gt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.equ) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.neu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.leu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.geu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) -} -#undef __COMPARISON_OP_BFLOAT162_MACRO - -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ - __nv_bfloat162 val; \ - bool retval; \ - asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ - :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\ - retval = true; \ - } else { \ - retval = false; \ - }\ - return retval;\ -} -#else - -#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ - unsigned int val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - " shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ - " and.b32 %0, high_res, low_res;}\n"\ - :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return (val != 0U) ? true : false; \ -} -#endif - -__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) -} -__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) -} -__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) -} -__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) -} -__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) -} -__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) -} -__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) -} -__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) -} -__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - 
__BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) -} -__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) -} -__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) -} -__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) -} -#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO -/****************************************************************************** -* __nv_bfloat16 comparison * -******************************************************************************/ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) -#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ - unsigned short val; \ - asm( "{ .reg .pred __$temp3;\n" \ - " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ - " selp.u16 %0, 1, 0, __$temp3;}" \ - : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ - return (val != 0U) ? true : false; \ -} -#else -#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ - unsigned int val; \ - asm( "{.reg .b32 a,b;\n"\ - " mov.b32 a, {0, %1};\n"\ - " mov.b32 b, {0, %2};\n"\ - " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ - :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return (val != 0U) ? 
true : false; \ -} -#endif -__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(eq) -} -__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ne) -} -__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(le) -} -__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ge) -} -__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(lt) -} -__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(gt) -} -__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(equ) -} -__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(neu) -} -__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(leu) -} -__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(geu) -} -__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(ltu) -} -__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __COMPARISON_OP_BFLOAT16_MACRO(gtu) -} -#undef __COMPARISON_OP_BFLOAT16_MACRO -/****************************************************************************** -* __nv_bfloat162 arithmetic * -******************************************************************************/ -#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\ - __nv_bfloat162 val; \ - asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ - " .reg .b16 low,high;\n"\ - " and.b32 high_a, %1, 0xffff0000U;\n"\ - " and.b32 high_b, %2, 0xffff0000U;\n"\ - " shl.b32 low_a, %1, 16;\n"\ - 
" shl.b32 low_b, %2, 16;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\ - " cvt.rn.bf16.f32 low, low_res;\n"\ - " cvt.rn.bf16.f32 high, high_res;\n"\ - " mov.b32 %0, {low,high};}\n"\ - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; \ -} /* while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x3f803f80U;\n" - " fma.rn.bf16x2 %0,%1,c,%2;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0xbf80bf80U;\n" - " fma.rn.bf16x2 %0,%2,c,%1;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x80008000U;\n" - " fma.rn.bf16x2 %0,%1,%2,c;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x3f803f80U;\n" - " fma.rn.bf16x2 %0,%1,c,%2;}\n" -#endif - 
:"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0xbf80bf80U;\n" - " fma.rn.bf16x2 %0,%2,c,%1;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" -#else - asm( "{.reg .b32 c;\n" - " mov.b32 c, 0x80008000U;\n" - " fma.rn.bf16x2 %0,%1,%2,c;}\n" -#endif - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " fma.rn.bf16x2 f,%1,one,%2;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero, mone;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " mov.b32 mone, 0xbf80bf80U;\n" - " fma.rn.bf16x2 f,%2,mone,%1;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{.reg .b32 f, one, zero, mzero;\n" 
- " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " mov.b32 mzero, 0x80008000U;\n" - " fma.rn.bf16x2 f,%1,%2,mzero;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{ .reg .b32 f, one, zero;\n" - " mov.b32 one, 0x3f803f80U;\n" - " mov.b32 zero, 0;\n" - " fma.rn.bf16x2 f, %1, %2, %3;\n" - " max.bf16x2 f, f, zero;\n" - " min.bf16x2 %0, f, one;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { - __nv_bfloat16 ha, hb; - - ha = __low2bfloat16(a); - hb = __low2bfloat16(b); - - const __nv_bfloat16 v1 = __hdiv(ha, hb); - - ha = __high2bfloat16(a); - hb = __high2bfloat16(b); - - const __nv_bfloat16 v2 = __hdiv(ha, hb); - - return __halves2bfloat162(v1, v2); -} -/****************************************************************************** -* __nv_bfloat16 arithmetic * -******************************************************************************/ -#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ - __nv_bfloat16 val; \ - asm( "{.reg .b32 a,b,res;\n"\ - " mov.b32 a, {0,%1};\n"\ - " mov.b32 b, {0,%2};\n"\ - " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ - " cvt.rn.bf16.f32 %0, res;}\n"\ - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; \ -} /* 
while(0) */ - -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x3f80U;\n" - " fma.rn.bf16 %0,%1,c,%2;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0xbf80U;\n" - " fma.rn.bf16 %0,%2,c,%1;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x8000U;\n" - " fma.rn.bf16 %0,%1,%2,c;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ add.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x3f80U;\n" - " fma.rn.bf16 %0,%1,c,%2;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ sub.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0xbf80U;\n" - " fma.rn.bf16 %0,%2,c,%1;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm( "{ mul.rn.bf16 %0,%1,%2; }\n" -#else - asm( "{.reg .b16 c;\n" - " mov.b16 c, 0x8000U;\n" - " fma.rn.bf16 %0,%1,%2,c;}\n" -#endif - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " fma.rn.bf16 f, %1, one, %2;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero, mone;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " mov.b16 mone, 0xbf80U;\n" - " fma.rn.bf16 f, %2, mone, %1;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero, mzero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " mov.b16 mzero, 0x8000U;\n" - " fma.rn.bf16 f, %1, %2, mzero;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : 
"h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{ .reg .b16 f, one, zero;\n" - " mov.b16 one, 0x3f80U;\n" - " mov.b16 zero, 0;\n" - " fma.rn.bf16 f, %1, %2, %3;\n" - " max.bf16 f, f, zero;\n" - " min.bf16 %0, f, one;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { - __BINARY_OP_BFLOAT16_MACRO(div.rn) -} - -/****************************************************************************** -* __nv_bfloat162 functions * -******************************************************************************/ -#define __APPROX_FCAST(fun) /* do */ {\ - __nv_bfloat16 val;\ - asm("{.reg.b32 f; \n"\ - " .reg.b16 r; \n"\ - " mov.b16 r,%1; \n"\ - " mov.b32 f,{0,r}; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 f,f; \n"\ - " cvt.rn.bf16.f32 r,f; \n"\ - " mov.b16 %0,r; \n"\ - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\ - return val;\ -} /* while(0) */ -#define __APPROX_FCAST2(fun) /* do */ {\ - __nv_bfloat162 val;\ - asm("{.reg.b16 hl, hu; \n"\ - " .reg.b32 fl, fu; \n"\ - " mov.b32 {hl, hu}, %1; \n"\ - " mov.b32 fl, {0,hl}; \n"\ - " mov.b32 fu, {0,hu}; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ - " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ - " cvt.rn.bf16.f32 hl, fl; \n"\ - " cvt.rn.bf16.f32 hu, fu; \n"\ - " mov.b32 %0, {hl, hu}; \n"\ - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ - return val;\ -} /* while(0) */ -__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { - float f = __bfloat162float(a); - f = sinf(f); - return __float2bfloat16_rn(f); -} -__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { - return 
__hsin_internal(a); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { - const __nv_bfloat16 l = __low2bfloat16(a); - const __nv_bfloat16 h = __high2bfloat16(a); - return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { - float f = __bfloat162float(a); - f = cosf(f); - return __float2bfloat16_rn(f); -} -__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { - return __hcos_internal(a); -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { - const __nv_bfloat16 l = __low2bfloat16(a); - const __nv_bfloat16 h = __high2bfloat16(a); - return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); -} - -#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ - "{.reg.b32 spc, ulp, p;\n"\ - " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ - " mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ - " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ - " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" -#define __BF16_SPEC_CASE(i,r, spc, ulp) \ - "{.reg.b16 spc, ulp, p;\n"\ - " mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ - " mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ - " set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ - " fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" - -__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 h,r; \n" - " mov.b16 h,%1; \n" - " mov.b32 f,{0,h}; \n" - " mov.b32 C, 0x3FB8AA3CU; \n" - " mul.f32 f,f,C; \n" - " ex2.approx.f32 f,f; \n" - " cvt.rn.bf16.f32 r,f; \n" - " mov.b16 %0,r; \n" - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " 
mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " mov.b32 C, 0x3FB8AA3CU; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { - __APPROX_FCAST(ex2) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { - __APPROX_FCAST2(ex2) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " mov.b32 f, {0,h}; \n" - " mov.b32 C, 0x40549A78U; \n" - " mul.f32 f,f,C; \n" - " ex2.approx.f32 f, f; \n" - " cvt.rn.bf16.f32 r, f; \n" - __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U) - " mov.b16 %0, r; \n" - "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " mov.b32 C, 0x40549A78U; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { - __APPROX_FCAST(lg2) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { - __APPROX_FCAST2(lg2) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 r,h; \n" - " mov.b16 h,%1; \n" - 
" mov.b32 f,{0,h}; \n" - " lg2.approx.f32 f,f; \n" - " mov.b32 C, 0x3f317218U; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.bf16.f32 r,f; \n" - " mov.b16 %0,r; \n" - "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3f317218U; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { - __nv_bfloat16 val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " mov.b32 f, {0,h}; \n" - " lg2.approx.f32 f, f; \n" - " mov.b32 C, 0x3E9A209BU; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.bf16.f32 r, f; \n" - " mov.b16 %0, r; \n" - "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { - __nv_bfloat162 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " mov.b32 fl, {0,hl}; \n" - " mov.b32 fu, {0,hu}; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3E9A209BU; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.bf16.f32 hl, fl; \n" - " cvt.rn.bf16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - " mov.b32 %0, r; \n" - "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); - return val; -} -#undef __BF16_SPEC_CASE2 -#undef __BF16_SPEC_CASE -__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { - __APPROX_FCAST2(rcp) -} 
-__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { - __APPROX_FCAST(rcp) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { - __APPROX_FCAST2(rsqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { - __APPROX_FCAST(rsqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { - __APPROX_FCAST2(sqrt) -} -__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { - __APPROX_FCAST(sqrt) -} -#undef __APPROX_FCAST -#undef __APPROX_FCAST2 -__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) -{ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat162 r; - asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -#else - const __nv_bfloat162 b = a; - __BINARY_OP_BFLOAT162_MACRO(set.nan.f32) -#endif -} -__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) -{ -#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return __BFLOAT16_TO_CUS(r) != 0U; -#else - unsigned int r; - asm( "{.reg .b32 a;\n" - " mov.b32 a, {0,%1};\n" - " set.nan.f32.f32 %0, a, a;}\n" - :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); - return r != 0U; -#endif -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a) -{ - __nv_bfloat162 r; - asm("{neg.bf16x2 %0,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) -{ - __nv_bfloat16 r; - asm("{neg.bf16 %0,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) -{ - __nv_bfloat162 r; - asm("{abs.bf16x2 %0,%1;\n}" - :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); - return r; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) -{ - __nv_bfloat16 r; - asm("{abs.bf16 
%0,%1;\n}" - :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); - return r; -} -/****************************************************************************** -* __nv_bfloat16 arithmetic * -******************************************************************************/ -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ max.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ min.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ max.NaN.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) -{ - __nv_bfloat16 val; - asm( "{ min.NaN.bf16 %0,%1,%2;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) -{ - __nv_bfloat16 val; - asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" - :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); - return val; -} -/****************************************************************************** -* __nv_bfloat162 arithmetic * -******************************************************************************/ -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ max.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : 
"r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ min.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) -{ - __nv_bfloat162 val; - asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); - return val; -} -__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - __nv_bfloat162 val; - asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" - :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); - return val; -} - -__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) -{ - // fast version of complex multiply-accumulate - // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) - // acc.re = (c.re + a.re*b.re) - a.im*b.im - // acc.im = (c.im + a.re*b.im) + a.im*b.re - __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); - __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); - real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); - img_tmp = __hfma(a.y, b.x, img_tmp); - return make_bfloat162(real_tmp, img_tmp); -} - - -/* Define __PTR for atomicAdd prototypes below, undef after done */ -#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __PTR "l" -#else -#define __PTR "r" -#endif /*(defined(_MSC_VER) && defined(_WIN64)) || 
defined(__LP64__) || defined(__CUDACC_RTC__)*/ - -__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat162 r; - asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" - : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) - : "memory"); - return r; -#else - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; - do { - assumed = old; - __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); - old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); - } while (assumed != old); - return *(__nv_bfloat162*)&old; -#endif -} - -__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) -{ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - __nv_bfloat16 r; - asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" - : "=h"(__BFLOAT16_TO_US(r)) - : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) - : "memory"); - return r; -#else - unsigned short int* address_as_us = (unsigned short int*)address; - unsigned short int old = *address_as_us, assumed; - do { - assumed = old; - old = atomicCAS(address_as_us, assumed, - __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); - } while (assumed != old); - return __ushort_as_bfloat16(old); -#endif -} - -#undef __PTR -#undef __CUDA_BF16_DECL__ -#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ -#endif /* defined(__cplusplus) */ - -#undef __BINARY_OP_BFLOAT162_MACRO -#undef __BINARY_OP_BFLOAT16_MACRO - -#undef __CUDA_HOSTDEVICE_BF16_DECL__ -#undef __CUDA_BF16_DECL__ - -/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ -/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ -#if defined(__cplusplus) && 
!defined(CUDA_NO_BFLOAT16) -typedef __nv_bfloat16 nv_bfloat16; -typedef __nv_bfloat162 nv_bfloat162; - -#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ - -#if defined(__CPP_VERSION_AT_LEAST_11_BF16) -#undef __CPP_VERSION_AT_LEAST_11_BF16 -#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ - -#endif /* end of include guard: __CUDA_BF16_HPP__ */ From 8b569c6e2d37eecc02d0f8299bfee9a56634ede2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Aug 2025 16:17:28 -0700 Subject: [PATCH 33/56] update format constant method for BfloatType --- numba_cuda/numba/cuda/models.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 02f629575..d6e28b82f 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -1,3 +1,4 @@ +import struct import functools from llvmlite import ir @@ -46,11 +47,16 @@ def __init__(self, dmm, fe_type): def _as_bfloat(value): - # Step 1: Convert to float - f = ir.types._as_float(value) + # Step 1: Reinterpret the input as u32 bits + u = struct.unpack("I", struct.pack("f", value))[0] + # Step 2: Truncate (or round, we choose truncate) last 16 bits - bf = f >> 16 - return bf + trunc = u >> 16 + + # Step 3: Unpack them back to Python floats + f = struct.unpack("f", struct.pack("I", trunc))[0] + + return f class BfloatType(ir.types._BaseFloatType): From 2148be9ee8a8ee83ee19cddb1ef0d736018ade65 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 12:33:25 -0700 Subject: [PATCH 34/56] implement printing support for bfloat16 --- numba_cuda/numba/cuda/printimpl.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/printimpl.py b/numba_cuda/numba/cuda/printimpl.py index 0a4fb4347..52e058358 100644 --- a/numba_cuda/numba/cuda/printimpl.py +++ b/numba_cuda/numba/cuda/printimpl.py @@ -5,7 +5,7 @@ from numba.core.errors import NumbaWarning from 
numba.core.imputils import Registry from numba.cuda import nvvmutils -from numba.cuda.types import Dim3 +from numba.cuda.types import Dim3, Bfloat16 from warnings import warn registry = Registry() @@ -48,6 +48,17 @@ def real_print_impl(ty, context, builder, val): return "%f", [lld] +@print_item.register(Bfloat16) +def bfloat16_print_impl(ty, context, builder, val): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(val, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + # printf("%f") expects a double; promote to f64 to match vararg expectation + f64 = builder.fpext(f32, ir.DoubleType()) + return "%f", [f64] + + @print_item.register(types.StringLiteral) def const_print_impl(ty, context, builder, sigval): pyval = ty.literal_value From 07b9c1e34084221134919f3d67f9a3d6889d3b14 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 12:34:14 -0700 Subject: [PATCH 35/56] implement to int conversion tests --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 257 +++++++----------- 1 file changed, 101 insertions(+), 156 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 1147bba11..f4f198d5b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,7 +1,8 @@ import unittest from importlib.util import find_spec +import numpy as np -from numba import cuda, float32 +from numba import cuda, float32, float64 from numba.cuda.bf16 import ( bfloat16, habs, @@ -40,6 +41,28 @@ float32_to_bfloat16_rz, float32_to_bfloat16_rd, float32_to_bfloat16_ru, + + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, + + int16_to_bfloat16_rn, + int16_to_bfloat16_rz, + int16_to_bfloat16_rd, + int16_to_bfloat16_ru, + bfloat16_to_int16_rn, + bfloat16_to_int16_rz, + bfloat16_to_int16_rd, + bfloat16_to_int16_ru, + + 
uint16_to_bfloat16_rn, + uint16_to_bfloat16_rz, + uint16_to_bfloat16_rd, + uint16_to_bfloat16_ru, + bfloat16_to_uint16_rn, + bfloat16_to_uint16_rz, + bfloat16_to_uint16_rd, + bfloat16_to_uint16_ru, + int32_to_bfloat16_rn, int32_to_bfloat16_rz, int32_to_bfloat16_rd, @@ -48,22 +71,29 @@ bfloat16_to_int32_rz, bfloat16_to_int32_rd, bfloat16_to_int32_ru, - bfloat16_to_int16_rn, - int16_to_bfloat16_rn, - bfloat16_to_uint16_rn, - uint16_to_bfloat16_rn, - bfloat16_to_uint32_rn, + uint32_to_bfloat16_rn, + uint32_to_bfloat16_rz, + uint32_to_bfloat16_rd, + uint32_to_bfloat16_ru, + bfloat16_to_uint32_rn, + bfloat16_to_uint32_rz, + bfloat16_to_uint32_rd, + bfloat16_to_uint32_ru, + bfloat16_to_int64_rn, - int64_to_bfloat16_rn, + bfloat16_to_int64_rz, + bfloat16_to_int64_rd, + bfloat16_to_int64_ru, bfloat16_to_uint64_rn, - uint64_to_bfloat16_rn, - bfloat16_as_short, - bfloat16_as_ushort, - short_as_bfloat16, - ushort_as_bfloat16, - bfloat16_to_int8_rz, - bfloat16_to_uint8_rz, + bfloat16_to_uint64_rz, + bfloat16_to_uint64_rd, + bfloat16_to_uint64_ru, + + bfloat16_as_int16, + bfloat16_as_uint16, + int16_as_bfloat16, + uint16_as_bfloat16, ) from numba.cuda.testing import CUDATestCase @@ -302,152 +332,67 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_int32_float32_precision_conversion_intrinsics(self): - self.skip_unsupported() - @cuda.jit - def kernel_float_to_bf16(out): - f = float32(3.14) - out[0] = float32(float32_to_bfloat16_rn(f)) - out[1] = float32(float32_to_bfloat16_rz(f)) - out[2] = float32(float32_to_bfloat16_rd(f)) - out[3] = float32(float32_to_bfloat16_ru(f)) - - @cuda.jit - def kernel_bf16_to_float(out): - a = bfloat16(3.14) - out[0] = bfloat16_to_float32(a) - - @cuda.jit - def kernel_int_to_bf16(out): - i = 3 - out[0] = float32(int32_to_bfloat16_rn(i)) - out[1] = float32(int32_to_bfloat16_rz(i)) - out[2] = float32(int32_to_bfloat16_rd(i)) - out[3] = float32(int32_to_bfloat16_ru(i)) - - 
@cuda.jit - def kernel_bf16_to_int(out): - a = bfloat16(3.14) - out[0] = bfloat16_to_int32_rn(a) - out[1] = bfloat16_to_int32_rz(a) - out[2] = bfloat16_to_int32_rd(a) - out[3] = bfloat16_to_int32_ru(a) - - out = cuda.device_array((4,), dtype="float32") - kernel_float_to_bf16[1, 1](out) - # Check they are near the original value in float32 after round-trip - # Note: Different rounding modes produce slightly different values - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) # rn - self.assertTrue(abs(out[1] - 3.140625) < 2e-2, out[1] - 3.140625) # rz - self.assertTrue(abs(out[2] - 3.140625) < 2e-2, out[2] - 3.140625) # rd - self.assertTrue(abs(out[3] - 3.140625) < 2e-2, out[3] - 3.140625) # ru - - out = cuda.device_array((1,), dtype="float32") - kernel_bf16_to_float[1, 1](out) - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) - - outi = cuda.device_array((4,), dtype="int32") - kernel_int_to_bf16[1, 1](outi) - # int to bf16 should be exactly representable for small integers - self.assertEqual(int(outi[0]), 3) - self.assertEqual(int(outi[1]), 3) - self.assertEqual(int(outi[2]), 3) - self.assertEqual(int(outi[3]), 3) - - outi = cuda.device_array((4,), dtype="int32") - kernel_bf16_to_int[1, 1](outi) - # 3.14 -> 3 for rz/rd, 3 or 4 for rn/ru depending on rounding - self.assertIn(int(outi[0]), (3, 4)) - self.assertEqual(int(outi[1]), 3) - self.assertEqual(int(outi[2]), 3) - self.assertIn(int(outi[3]), (3, 4)) - - def test_floatroundtrip_integer_conversion_intrinsics(self): + def test_to_integer_conversions(self): self.skip_unsupported() @cuda.jit - def kernel_scalar_roundtrip(out): - f = 3.14 - bf = float32_to_bfloat16(f) - out[0] = bfloat16_to_float32(bf) - d = 3.14 - bf2 = float64_to_bfloat16(d) - out[1] = bfloat16_to_float32(bf2) - - out = cuda.device_array((2,), dtype="float32") - kernel_scalar_roundtrip[1, 1](out) - self.assertAlmostEqual(out[0], 3.140625, delta=1e-3) - self.assertAlmostEqual(out[1], 3.140625, delta=1e-3) - - @cuda.jit - def 
kernel_int_family(outf): - outf[0] = float32(int16_to_bfloat16_rn(123)) - outf[1] = float32(uint16_to_bfloat16_rn(456)) - outf[2] = float32(uint32_to_bfloat16_rn(789)) - outf[3] = float32(int64_to_bfloat16_rn(1011)) - outf[4] = float32(uint64_to_bfloat16_rn(1213)) - - outf = cuda.device_array((5,), dtype="float32") - kernel_int_family[1, 1](outf) - vals = [123, 456, 789, 1011, 1213] - for i, v in enumerate(vals): - got = int(outf[i]) - # `step` estimates ULP near the integer `v`. - # Bfloat16 has 7 bits of precision, spacing between representable values are 2**(e-7). - # We use the exponent of the value `v` to raise the minSpacing, the result is a reasonable - # esitmate the local ULP. - step = ( - 0 if v == 0 else 2 ** (int(math.floor(math.log2(abs(v)))) - 7) - ) - # `allowed` is the maximum error in ULP, with a minimum of 1 - # In general, half ULP is the typical rounding error bound. - allowed = max(1, int(step // 2)) - self.assertLessEqual(abs(got - v), allowed) - - @cuda.jit - def kernel_from_bf16_to_ints(outi): - a = bfloat16(5.75) - outi[0] = bfloat16_to_int16_rn(a) - outi[1] = bfloat16_to_uint16_rn(a) - outi[2] = bfloat16_to_uint32_rn(a) - outi[3] = bfloat16_to_int64_rn(a) - outi[4] = bfloat16_to_uint64_rn(a) - - outi = cuda.device_array((5,), dtype="int64") - kernel_from_bf16_to_ints[1, 1](outi) - self.assertEqual(int(outi[0]), 6) - self.assertEqual(int(outi[1]), 6) - self.assertEqual(int(outi[2]), 6) - self.assertEqual(int(outi[3]), 6) - self.assertEqual(int(outi[4]), 6) - - @cuda.jit - def kernel_bit_reinterpret(out_short, out_ushort): - s = 12345 - bf = short_as_bfloat16(s) - out_short[0] = bfloat16_as_short(bf) - us = 54321 - bf2 = ushort_as_bfloat16(us) - out_ushort[0] = bfloat16_as_ushort(bf2) - - out_short = cuda.device_array((1,), dtype="int32") - out_ushort = cuda.device_array((1,), dtype="uint32") - kernel_bit_reinterpret[1, 1](out_short, out_ushort) - self.assertEqual(int(out_short[0]), 12345) - self.assertEqual(int(out_ushort[0]), 54321) - - 
@cuda.jit - def kernel_char(out_c, out_uc): - a = bfloat16(3.9) - out_c[0] = bfloat16_to_int8_rz(a) - out_uc[0] = bfloat16_to_uint8_rz(a) - - out_c = cuda.device_array((1,), dtype="int8") - out_uc = cuda.device_array((1,), dtype="uint8") - kernel_char[1, 1](out_c, out_uc) - self.assertEqual(int(out_c[0]), 3) - self.assertEqual(int(out_uc[0]), 3) + def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): + a = int16_as_bfloat16(test_val) + + i1[0] = bfloat16_to_int8_rz(a) + u1[0] = bfloat16_to_uint8_rz(a) + i2[0] = bfloat16_to_int16_rn(a) + i2[1] = bfloat16_to_int16_rz(a) + i2[2] = bfloat16_to_int16_rd(a) + i2[3] = bfloat16_to_int16_ru(a) + u2[0] = bfloat16_to_uint16_rn(a) + u2[1] = bfloat16_to_uint16_rz(a) + u2[2] = bfloat16_to_uint16_rd(a) + u2[3] = bfloat16_to_uint16_ru(a) + i3[0] = bfloat16_to_int32_rn(a) + i3[1] = bfloat16_to_int32_rz(a) + i3[2] = bfloat16_to_int32_rd(a) + i3[3] = bfloat16_to_int32_ru(a) + u3[0] = bfloat16_to_uint32_rn(a) + u3[1] = bfloat16_to_uint32_rz(a) + u3[2] = bfloat16_to_uint32_rd(a) + u3[3] = bfloat16_to_uint32_ru(a) + i4[0] = bfloat16_to_int64_rn(a) + i4[1] = bfloat16_to_int64_rz(a) + i4[2] = bfloat16_to_int64_rd(a) + i4[3] = bfloat16_to_int64_ru(a) + u4[0] = bfloat16_to_uint64_rn(a) + u4[1] = bfloat16_to_uint64_rz(a) + u4[2] = bfloat16_to_uint64_rd(a) + u4[3] = bfloat16_to_uint64_ru(a) + + # rz + i1 = cuda.device_array((1,), dtype="int8") + # rn, rz, rd, ru + i2 = cuda.device_array((4,), dtype="int16") + i3 = cuda.device_array((4,), dtype="int32") + i4 = cuda.device_array((4,), dtype="int64") + # rz + u1 = cuda.device_array((1,), dtype="uint8") + # rn, rz, rd, ru + u2 = cuda.device_array((4,), dtype="uint16") + u3 = cuda.device_array((4,), dtype="uint32") + u4 = cuda.device_array((4,), dtype="uint64") + + test_val = np.int16(0x3fc0) # 1.5 in bfloat16 + + kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4) + + self.assertEqual(i1[0], 1) + self.assertEqual(u1[0], 1) + + np.testing.assert_equal(i2, np.array([2, 1, 1, 2], "int16")) + 
np.testing.assert_equal(i3, np.array([2, 1, 1, 2], "int32")) + np.testing.assert_equal(i4, np.array([2, 1, 1, 2], "int64")) + np.testing.assert_equal(u2, np.array([2, 1, 1, 2], "uint16")) + np.testing.assert_equal(u3, np.array([2, 1, 1, 2], "uint32")) + np.testing.assert_equal(u4, np.array([2, 1, 1, 2], "uint64")) @unittest.skipIf( find_spec("ml_dtypes") is None, From 0834f6d3ea47bbb6e80a0c6bb73c047359dd090d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 14:51:09 -0700 Subject: [PATCH 36/56] add from integer conversion test --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 145 +++++++++++++++--- 1 file changed, 126 insertions(+), 19 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index f4f198d5b..ec59a5285 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,8 +1,9 @@ import unittest from importlib.util import find_spec import numpy as np +from ml_dtypes import bfloat16 as mldtypes_bf16 -from numba import cuda, float32, float64 +from numba import cuda, float32, int16, int32, int64, uint16, uint32, uint64 from numba.cuda.bf16 import ( bfloat16, habs, @@ -34,17 +35,8 @@ hisnan, hisinf, # Conversion intrinsics (NumPy-style names) - bfloat16_to_float32, - float32_to_bfloat16, - float64_to_bfloat16, - float32_to_bfloat16_rn, - float32_to_bfloat16_rz, - float32_to_bfloat16_rd, - float32_to_bfloat16_ru, - bfloat16_to_int8_rz, bfloat16_to_uint8_rz, - int16_to_bfloat16_rn, int16_to_bfloat16_rz, int16_to_bfloat16_rd, @@ -53,7 +45,6 @@ bfloat16_to_int16_rz, bfloat16_to_int16_rd, bfloat16_to_int16_ru, - uint16_to_bfloat16_rn, uint16_to_bfloat16_rz, uint16_to_bfloat16_rd, @@ -62,7 +53,6 @@ bfloat16_to_uint16_rz, bfloat16_to_uint16_rd, bfloat16_to_uint16_ru, - int32_to_bfloat16_rn, int32_to_bfloat16_rz, int32_to_bfloat16_rd, @@ -71,7 +61,6 @@ bfloat16_to_int32_rz, bfloat16_to_int32_rd, 
bfloat16_to_int32_ru, - uint32_to_bfloat16_rn, uint32_to_bfloat16_rz, uint32_to_bfloat16_rd, @@ -80,20 +69,24 @@ bfloat16_to_uint32_rz, bfloat16_to_uint32_rd, bfloat16_to_uint32_ru, - bfloat16_to_int64_rn, bfloat16_to_int64_rz, bfloat16_to_int64_rd, bfloat16_to_int64_ru, + int64_to_bfloat16_rn, + int64_to_bfloat16_rz, + int64_to_bfloat16_rd, + int64_to_bfloat16_ru, bfloat16_to_uint64_rn, bfloat16_to_uint64_rz, bfloat16_to_uint64_rd, bfloat16_to_uint64_ru, - + uint64_to_bfloat16_rn, + uint64_to_bfloat16_rz, + uint64_to_bfloat16_rd, + uint64_to_bfloat16_ru, bfloat16_as_int16, - bfloat16_as_uint16, int16_as_bfloat16, - uint16_as_bfloat16, ) from numba.cuda.testing import CUDATestCase @@ -332,7 +325,6 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) - def test_to_integer_conversions(self): self.skip_unsupported() @@ -380,7 +372,7 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): u3 = cuda.device_array((4,), dtype="uint32") u4 = cuda.device_array((4,), dtype="uint64") - test_val = np.int16(0x3fc0) # 1.5 in bfloat16 + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 kernel[1, 1](test_val, i1, i2, i3, i4, u1, u2, u3, u4) @@ -394,6 +386,101 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): np.testing.assert_equal(u3, np.array([2, 1, 1, 2], "uint32")) np.testing.assert_equal(u4, np.array([2, 1, 1, 2], "uint64")) + def test_from_integer_conversions(self): + self.skip_unsupported() + + test_val = 789 + + @cuda.jit + def kernel(out): + i2 = int16(test_val) + i3 = int32(test_val) + i4 = int64(test_val) + u2 = uint16(test_val) + u3 = uint32(test_val) + u4 = uint64(test_val) + + i2rn = int16_to_bfloat16_rn(i2) + i2rz = int16_to_bfloat16_rz(i2) + i2rd = int16_to_bfloat16_rd(i2) + i2ru = int16_to_bfloat16_ru(i2) + + u2rn = uint16_to_bfloat16_rn(u2) + u2rz = uint16_to_bfloat16_rz(u2) + u2rd = uint16_to_bfloat16_rd(u2) + u2ru = uint16_to_bfloat16_ru(u2) + + i3rn = int32_to_bfloat16_rn(i3) + i3rz = 
int32_to_bfloat16_rz(i3) + i3rd = int32_to_bfloat16_rd(i3) + i3ru = int32_to_bfloat16_ru(i3) + + u3rn = uint32_to_bfloat16_rn(u3) + u3rz = uint32_to_bfloat16_rz(u3) + u3rd = uint32_to_bfloat16_rd(u3) + u3ru = uint32_to_bfloat16_ru(u3) + + i4rn = int64_to_bfloat16_rn(i4) + i4rz = int64_to_bfloat16_rz(i4) + i4rd = int64_to_bfloat16_rd(i4) + i4ru = int64_to_bfloat16_ru(i4) + + u4rn = uint64_to_bfloat16_rn(u4) + u4rz = uint64_to_bfloat16_rz(u4) + u4rd = uint64_to_bfloat16_rd(u4) + u4ru = uint64_to_bfloat16_ru(u4) + + out[0] = bfloat16_as_int16(i2rn) + out[1] = bfloat16_as_int16(i2rz) + out[2] = bfloat16_as_int16(i2rd) + out[3] = bfloat16_as_int16(i2ru) + out[4] = bfloat16_as_int16(u2rn) + out[5] = bfloat16_as_int16(u2rz) + out[6] = bfloat16_as_int16(u2rd) + out[7] = bfloat16_as_int16(u2ru) + out[8] = bfloat16_as_int16(i3rn) + out[9] = bfloat16_as_int16(i3rz) + out[10] = bfloat16_as_int16(i3rd) + out[11] = bfloat16_as_int16(i3ru) + out[12] = bfloat16_as_int16(u3rn) + out[13] = bfloat16_as_int16(u3rz) + out[14] = bfloat16_as_int16(u3rd) + out[15] = bfloat16_as_int16(u3ru) + out[16] = bfloat16_as_int16(i4rn) + out[17] = bfloat16_as_int16(i4rz) + out[18] = bfloat16_as_int16(i4rd) + out[19] = bfloat16_as_int16(i4ru) + out[20] = bfloat16_as_int16(u4rn) + out[21] = bfloat16_as_int16(u4rz) + out[22] = bfloat16_as_int16(u4rd) + out[23] = bfloat16_as_int16(u4ru) + + out = cuda.device_array((24,), dtype="int16") + kernel[1, 1](out) + res = out.copy_to_host() + + i2 = np.int16(789).astype(mldtypes_bf16).view("int16") + i3 = np.int32(789).astype(mldtypes_bf16).view("int16") + i4 = np.int64(789).astype(mldtypes_bf16).view("int16") + u2 = np.uint16(789).astype(mldtypes_bf16).view("int16") + u3 = np.uint32(789).astype(mldtypes_bf16).view("int16") + u4 = np.uint64(789).astype(mldtypes_bf16).view("int16") + + i2arr = np.array([i2] * 4) + i3arr = np.array([i3] * 4) + i4arr = np.array([i4] * 4) + u2arr = np.array([u2] * 4) + u3arr = np.array([u3] * 4) + u4arr = np.array([u4] * 4) + + two 
= np.ones_like(res[0:4]) * 2 + np.testing.assert_array_less(_bf16_ulp_distance(res[0:4], i2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[4:8], i3arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[8:12], i4arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[12:16], u2arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], u3arr), two) + np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", @@ -401,3 +488,23 @@ def kernel(test_val, i1, i2, i3, i4, u1, u2, u3, u4): def test_use_bfloat16_on_host(self): x = bfloat16(3.0) self.assertEqual(x, 3.0) + + +def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray: + """ + Compute the ULP rank of a bfloat16 value. Input is the bits of the bfloat16 value as an int16. + The ULP rank is the number of ULPs between the value and 0. + Negative values are performed the inverse of 2's complement before computing the rank. + """ + u = bits_int16.view(np.uint16) + sign = u >> 15 + return np.where(sign == 0, u + 0x8000, 0x8000 - u).astype(np.int32) + + +def _bf16_ulp_distance( + a_bits_int16: np.ndarray, b_bits_int16: np.ndarray +) -> np.ndarray: + """ + Compute the difference between two bfloat16 values in ULPs. 
+ """ + return np.abs(_bf16_ulp_rank(a_bits_int16) - _bf16_ulp_rank(b_bits_int16)) From 264f06986f7ac4b6d17b36bc3ec89a74f100365a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 15:16:43 -0700 Subject: [PATCH 37/56] testing bitcast operations --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 95 ++++++++++++++++++- numba_cuda/numba/cuda/types.py | 2 +- 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index ec59a5285..79cf6c976 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -3,7 +3,17 @@ import numpy as np from ml_dtypes import bfloat16 as mldtypes_bf16 -from numba import cuda, float32, int16, int32, int64, uint16, uint32, uint64 +from numba import ( + cuda, + float32, + float64, + int16, + int32, + int64, + uint16, + uint32, + uint64, +) from numba.cuda.bf16 import ( bfloat16, habs, @@ -87,6 +97,15 @@ uint64_to_bfloat16_ru, bfloat16_as_int16, int16_as_bfloat16, + bfloat16_as_uint16, + uint16_as_bfloat16, + bfloat16_to_float32, + float32_to_bfloat16, + float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, ) from numba.cuda.testing import CUDATestCase @@ -325,6 +344,20 @@ def kernel(out): self.assertAlmostEqual(out[2], 2.0, delta=1e-3) self.assertAlmostEqual(out[3], 2.0, delta=1e-3) + def test_bfloat16_as_bitcast(self): + @cuda.jit + def roundtrip_kernel(test_val, i2, u2): + i2[0] = int16_as_bfloat16(bfloat16_as_int16(test_val)) + u2[0] = uint16_as_bfloat16(bfloat16_as_uint16(test_val)) + + test_val = np.int16(0x3FC0) # 1.5 in bfloat16 + i2 = cuda.device_array((1,), dtype="int16") + u2 = cuda.device_array((1,), dtype="uint16") + roundtrip_kernel[1, 1](test_val, i2, u2) + + self.assertEqual(i2[0], test_val) + self.assertEqual(u2[0], test_val) + def test_to_integer_conversions(self): 
self.skip_unsupported() @@ -481,6 +514,66 @@ def kernel(out): np.testing.assert_array_less(_bf16_ulp_distance(res[16:20], u3arr), two) np.testing.assert_array_less(_bf16_ulp_distance(res[20:24], u4arr), two) + def test_to_float_conversions(self): + self.skip_unsupported() + + @cuda.jit + def kernel(out): + a = bfloat16(1.5) + out[0] = bfloat16_to_float32(a) + + out = cuda.device_array((1,), dtype="float32") + kernel[1, 1](out) + + self.assertAlmostEqual(out[0], 1.5, delta=1e-7) # conversion is exact + + def test_from_float_conversions(self): + self.skip_unsupported() + + test_val = 1.5 + + @cuda.jit + def kernel(out): + f4 = float32(test_val) + f8 = float64(test_val) + + f4rn = float32_to_bfloat16_rn(f4) + f4rz = float32_to_bfloat16_rz(f4) + f4rd = float32_to_bfloat16_rd(f4) + f4ru = float32_to_bfloat16_ru(f4) + + f4_default = float32_to_bfloat16(f4) + f8_default = float64_to_bfloat16(f8) + + out[0] = bfloat16_as_int16(f4rn) + out[1] = bfloat16_as_int16(f4rz) + out[2] = bfloat16_as_int16(f4rd) + out[3] = bfloat16_as_int16(f4ru) + out[4] = bfloat16_as_int16(f4_default) + out[5] = bfloat16_as_int16(f8_default) + + out = cuda.device_array((1,), dtype="int16") + kernel[1, 1](out) + raw = out.copy_to_host() + + f4_expected = ( + np.array([test_val] * 4, "float32") + .astype(mldtypes_bf16) + .view("int16") + ) + f8_expected = ( + np.array([test_val] * 1, "float64") + .astype(mldtypes_bf16) + .view("int16") + ) + + np.testing.assert_array_less( + _bf16_ulp_distance(raw[0:4], f4_expected), 2 + ) + np.testing.assert_array_less( + _bf16_ulp_distance(raw[4:], f8_expected), 2 + ) + @unittest.skipIf( find_spec("ml_dtypes") is None, "ml_dtypes is required to use bfloat16 on host", diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 5ddcaef5e..17a4184d1 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -52,7 +52,7 @@ def __init__(self): self.alignof_ = 2 self.bitwidth = 16 - def can_convert_from(self, other): + def 
can_convert_from(self, typingctx, other): if isinstance(other, types.Float): return Conversion.unsafe From 88ac53eb0565dd4b69b75c232baa4d7e455d17bc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 18 Aug 2025 21:42:12 -0700 Subject: [PATCH 38/56] add fp16, bf16 vended headers --- numba_cuda/numba/cuda/include/13/cuda_bf16.h | 5118 ++++++++++++++++ numba_cuda/numba/cuda/include/13/cuda_fp16.h | 5363 +++++++++++++++++ .../numba/cuda/include/13/cuda_fp16.hpp | 3483 +++++++++++ 3 files changed, 13964 insertions(+) create mode 100644 numba_cuda/numba/cuda/include/13/cuda_bf16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_fp16.h create mode 100644 numba_cuda/numba/cuda/include/13/cuda_fp16.hpp diff --git a/numba_cuda/numba/cuda/include/13/cuda_bf16.h b/numba_cuda/numba/cuda/include/13/cuda_bf16.h new file mode 100644 index 000000000..38feffba0 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_bf16.h @@ -0,0 +1,5118 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions. +* To use these functions, include the header file \p cuda_bf16.h in your program. +* All of the functions defined here are available in device code. +* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. 
+* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. Specific examples are: +* - hsin(__nv_bfloat16); +* - hcos(__nv_bfloat16); +* - h2sin(__nv_bfloat162); +* - h2cos(__nv_bfloat162); +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_BFLOAT16 - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_BFLOAT16_CONVERSIONS__ - If defined, this macro will prevent +* the use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p __nv_bfloat16 which is essentially a user-defined type. +* - \p __CUDA_NO_BFLOAT16_OPERATORS__ and \p __CUDA_NO_BFLOAT162_OPERATORS__ - +* If defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p __nv_bfloat16 and \p __nv_bfloat162 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16_CONSTANTS Bfloat16 Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these constants, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/
+
+#ifndef __CUDA_BF16_H__
+#define __CUDA_BF16_H__
+
+/* bring in __half data type and operations, for use in converting constructors */
+#include "cuda_fp16.h"
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+/* bring in operations on vector types like: make_float2 */
+#include "vector_functions.h"
+#endif /* !defined(__CUDACC_RTC__) */
+
+#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+
+/* Set up function decorations */
+#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))
+#define __CUDA_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ __device__
+#define __CUDA_HOSTDEVICE__ __device__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */
+
+#define __CUDA_BF16_TYPES_EXIST__
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Forward-declaration of structures defined in "cuda_bf16.hpp" */
+struct __nv_bfloat16;
+struct __nv_bfloat162;
+
+/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to \p nv_bfloat16 using round-to-nearest-even mode. +* - __double2bfloat16 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2bfloat16 \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2bfloat16(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-to-nearest-even mode. +* - __float2bfloat16_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-towards-zero mode. +* - __float2bfloat16_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rz(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-down mode. 
+* - __float2bfloat16_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16 using round-up mode. +* - __float2bfloat16_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2bfloat16_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2bfloat16_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts \p nv_bfloat16 number to float. +* +* \details Converts nv_bfloat16 number \p a to float. +* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __bfloat162float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __bfloat162float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __bfloat162float(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* +* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 +* precision number. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even +* mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2bfloat16_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. 
+* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* +* \see __float2bfloat16_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float and returns the +* result as a \p float2 packed value. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* +* \see __bfloat162float(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __bfloat162char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. +* - __bfloat162char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __bfloat162char_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ signed char __bfloat162char_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned char in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __bfloat162uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __bfloat162uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __bfloat162uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned char __bfloat162uchar_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __bfloat162int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rn \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rn \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rn(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __bfloat162int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rz \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rz \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __bfloat162int_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __bfloat162int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162int_rd \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __bfloat162int_rd \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000. 
+* - __bfloat162int_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer using round-up mode.
+* - __bfloat162int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __bfloat162int_ru \cuda_math_formula (x), x > INT_MAX\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF.
+* - __bfloat162int_ru \cuda_math_formula (x), x < INT_MIN\end_cuda_math_formula returns INT_MIN = \p 0x80000000.
+* - __bfloat162int_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
+*
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read.
+*
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __bfloat162short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. 
+* - __bfloat162short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __bfloat162short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. 
+* - __bfloat162short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __bfloat162short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __bfloat162short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __bfloat162short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __bfloat162short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. 
+* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. 
+* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Vector function, combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p x and \p y into one \p nv_bfloat162 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - nv_bfloat16. Is only being read. +* \param[in] y - nv_bfloat16. 
Is only being read. +* +* \returns __nv_bfloat162 +* - The \p __nv_bfloat162 vector with one half equal to \p x and the other to \p y. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. 
+* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. 
+* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The smallest integer value not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The nearest integer to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
+*
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The truncated \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. 
+*
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of smallest integers not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of largest integers which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+*
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+*
+* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 +* number.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat162 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Swaps both halves of the \p nv_bfloat162 input. +* +* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number +* with swapped halves. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. 
High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. 
+* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned int mask, const __nv_bfloat162 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned int mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p nv_bfloat162. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat162 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned int mask, const __nv_bfloat16 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. 
+* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. 
Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned int mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. 
+* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p nv_bfloat16. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned int mask, const __nv_bfloat16 var, const int laneMask, const int width = warpSize); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300))) || defined(_NVHPC_CUDA) */ + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. 
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. +* +* Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. 
+* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __heq2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hne2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hle2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hge2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hlt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgt2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hequ2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hneu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hleu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgeu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hltu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __hgtu2_mask(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. 
Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. 
Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. 
+* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. 
Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true if both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true if both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. 
+* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). 
+* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800))) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even +* mode. 
+* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates approximate \p nv_bfloat16 hyperbolic tangent function. +* +* \details Calculates approximate \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh_approx(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector approximate hyperbolic tangent function. +* +* \details Calculates \p nv_bfloat162 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 9.x and higher. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__nv_bfloat16) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh_approx(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 hyperbolic tangent function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htanh(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__nv_bfloat16) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2tanh(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. 
+* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. 
Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* NOTE: this function's implementation calls cosf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes cosf(float) +* into an intrinsic __cosf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* +* NOTE: this function's implementation calls sinf(float) function and is exposed +* to compiler optimizations. Specifically, \p --use_fast_math flag changes sinf(float) +* into an intrinsic __sinf(float), which has less accurate numeric behavior. +* +* \param[in] a - nv_bfloat162. Is only being read. +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. 
The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices use emulation path. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is natively supported by devices of compute capability 9.x and higher, +* older devices of compute capability 7.x and 8.x use emulation path. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. +* +* \returns __nv_bfloat16 +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for ::std::move. + * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_BF16_INLINE__ +#define __CUDA_BF16_FORCEINLINE__ +#else +#define __CUDA_BF16_INLINE__ inline +#define __CUDA_BF16_FORCEINLINE__ __forceinline__ +#endif /* #if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_BF16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_BF16_CONSTEXPR__ constexpr +#else +#define __CUDA_BF16_CONSTEXPR__ +#endif + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat16_raw data type + * \details Type allows static initialization of \p nv_bfloat16 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat16, + * and not a conversion from \p short to \p nv_bfloat16. 
+ * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p nv_bfloat16 floating-point number. + */ + unsigned short x; +} __nv_bfloat16_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief __nv_bfloat162_raw data type + * \details Type allows static initialization of \p nv_bfloat162 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p nv_bfloat162, + * and not a conversion from \p short2 to \p nv_bfloat162. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p nv_bfloat16 part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p nv_bfloat16 part. + */ + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. + */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p volatile \p __nv_bfloat16_raw to \p volatile \p __nv_bfloat16. 
+ */ + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile; + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p __half input using default round-to-nearest-even rounding mode. + */ + explicit __CUDA_HOSTDEVICE__ __nv_bfloat16(const __half f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.bf16.f16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2bfloat16(__half2float(f)).__x; +) +} +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p float input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p float operator. + */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p float input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast to \p __nv_bfloat16 assignment operator from \p double input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __int2bfloat16_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2bfloat16_rn(static_cast(val)).__x; + } else { + __x = __uint2bfloat16_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Construct \p __nv_bfloat16 from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162char_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162uchar_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in signed and unsigned char operators. + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162short_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ushort_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162int_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p int data type. 
+ * Using round-toward-zero rounding mode. + * + * See __bfloat162uint_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ll_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * See __bfloat162ull_rz(__nv_bfloat16) for further details + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* !(defined __CUDA_BF16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 addition operation. + * See also __hadd(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 subtraction operation. + * See also __hsub(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 multiplication operation. 
+ * See also __hmul(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 division operation. + * See also __hdiv(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 compound assignment with division operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh); + +/* Note for increment and decrement we use the raw value 0x3F80U equating to nv_bfloat16(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator++(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 prefix decrement operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 &operator--(__nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Performs \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored); +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator+(const __nv_bfloat16 &h); +/** + * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC + * Implements \p nv_bfloat16 unary minus operator. + * See also __hneg(__nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat16 operator-(const __nv_bfloat16 &h); + +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered compare equal operation. + * See also __heq(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 unordered compare not-equal operation. + * See also __hneu(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-than compare operation. 
+ * See also __hgt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-than compare operation. + * See also __hlt(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hge(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT16_COMPARISON + * Performs \p nv_bfloat16 ordered less-or-equal compare operation. + * See also __hle(__nv_bfloat16, __nv_bfloat16) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh); +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ + +/** +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief nv_bfloat162 datatype + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __nv_bfloat162 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + /** + * Storage field holding lower \p __nv_bfloat16 part. + */ + __nv_bfloat16 x; + /** + * Storage field holding upper \p __nv_bfloat16 part. + */ + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src); +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162(); +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from two \p __nv_bfloat16 variables + */ + __CUDA_HOSTDEVICE__ __CUDA_BF16_CONSTEXPR__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src); + + /* Convert to/from __nv_bfloat162_raw */ + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Constructor from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Assignment operator from \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r); + /** + * \ingroup CUDA_MATH__BFLOAT16_MISC + * Conversion operator to \p __nv_bfloat162_raw + */ + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const; +}; + +#if !defined(__CUDA_NO_BFLOAT162_OPERATORS__) +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 addition operation. 
+ * See also __hadd2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 subtraction operation. + * See also __hsub2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 multiplication operation. + * See also __hmul2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 division operation. + * See also __h2div(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with addition operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with subtraction operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with multiplication operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 compound assignment with division operation. 
+ */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator++(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 prefix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 &operator--(__nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix increment operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Performs packed \p nv_bfloat16 postfix decrement operation. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator+(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC + * Implements packed \p nv_bfloat16 unary minus operator. + * See also __hneg2(__nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ __nv_bfloat162 operator-(const __nv_bfloat162 &h); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered compare equal operation. + * See also __hbeq2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 unordered compare not-equal operation. 
+ * See also __hbneu2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-than compare operation. + * See also __hbgt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-than compare operation. + * See also __hblt2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered greater-or-equal compare operation. + * See also __hbge2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); +/** + * \ingroup CUDA_MATH__BFLOAT162_COMPARISON + * Performs packed \p nv_bfloat16 ordered less-or-equal compare operation. 
+ * See also __hble2(__nv_bfloat162, __nv_bfloat162) + */ +__CUDA_HOSTDEVICE__ __CUDA_BF16_FORCEINLINE__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh); + +#endif /* !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ +#ifdef __CUDACC_RTC__ +inline +#else +__CUDA_BF16_FORCEINLINE__ +#endif +__half::__half(const __nv_bfloat16 f) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + asm("{ cvt.rn.f16.bf16 %0, %1;}\n" : "=h"(__x) : "h"(__BFLOAT16_TO_CUS(f))); +, + __x = __float2half_rn(__bfloat162float(f)).__x; +) +} +#endif +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) +/* Note the .hpp file is included to capture the "nv_bfloat16" & "nv_bfloat162" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. +*/ +#include "cuda_bf16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the bfloat16 numbers format. + * + * \details Should be implemented in the compiler in the future. 
+ * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat16 nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of bfloat16 numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_BF16_INLINE__ +#undef __CUDA_BF16_FORCEINLINE__ +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.h b/numba_cuda/numba/cuda/include/13/cuda_fp16.h new file mode 100644 index 000000000..788b81452 --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.h @@ -0,0 +1,5363 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions. +* To use these functions, include the header file \p cuda_fp16.h in your program. +* All of the functions defined here are available in device code. 
+* Some of the functions are also available to host compilers, please +* refer to respective functions' documentation for details. +* +* NOTE: Aggressive floating-point optimizations performed by host or device +* compilers may affect numeric behavior of the functions implemented in this +* header. +* +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the +* use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p half which is essentially a user-defined type. +* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If +* defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p half and \p half2 types. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS Half Arithmetic Constants +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these constants, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" +/* bring in operations on vector types like: make_float2 */ +#include "vector_functions.h" +#endif /* !defined(__CUDACC_RTC__) */ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) + +/* Set up function decorations */ +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ __device__ +#define __CUDA_HOSTDEVICE__ __device__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else 
/* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Macros to allow half & half2 to be used by inline assembly */ +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast(&(var))) + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ +struct __half; +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __double2half \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __double2half \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __double2half(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. 
+* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-to-nearest-even mode. +* - __float2half_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rn \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rn(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half precision using round-towards-zero mode. +* - __float2half_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rz \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rz(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-down mode. +* - __float2half_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_rd \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_rd(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half precision using round-up mode. +* - __float2half_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __float2half_ru \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __float2half_ru(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. 
+* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* - __half2float \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - __half2float \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - __half2float(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with both halves equal to the converted half +* precision number. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with corresponding halves equal to the +* converted input floats. +* +* \see __float2half_rn(float) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed char in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed char +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns signed char +* - \p h converted to a signed char using round-towards-zero mode. +* - __half2char_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2char_rz \cuda_math_formula (x), x > 127\end_cuda_math_formula returns SCHAR_MAX = \p 0x7F. 
+* - __half2char_rz \cuda_math_formula (x), x < -128\end_cuda_math_formula returns SCHAR_MIN = \p 0x80. +* - __half2char_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned char in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned +* char in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned char +* - \p h converted to an unsigned char using round-towards-zero mode. +* - __half2uchar_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uchar_rz \cuda_math_formula (x), x > 255\end_cuda_math_formula returns UCHAR_MAX = \p 0xFF. +* - __half2uchar_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uchar_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-towards-zero mode. +* - __half2short_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rz \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rz \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-towards-zero mode. +* - __half2ushort_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-towards-zero mode. +* - __half2int_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rz(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-towards-zero mode. +* - __half2uint_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rz(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-towards-zero mode. +* - __half2ll_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rz \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rz(NaN) returns \p 0x8000000000000000. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-towards-zero mode. +* - __half2ull_rz \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rz \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rz \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rz(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Vector function, combines two \p __half numbers into one \p __half2 number. +* +* \details Combines two input \p __half number \p x and \p y into one \p __half2 number. +* Input \p x is stored in low 16 bits of the return value, input \p y is stored +* in high 16 bits of the return value. +* \param[in] x - half. Is only being read. +* \param[in] y - half. Is only being read. +* +* \returns __half2 +* - The \p __half2 vector with one half equal to \p x and the other to \p y. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of \p float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of \p float2 to half precision in round-to-nearest-even +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted \p float2 components. +* +* \see __float2half_rn(float) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to \p float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to \p float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to \p float2. +* +* \see __half2float(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
+* +* \returns int +* - \p h converted to a signed integer using round-to-nearest-even mode. +* - __half2int_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-down mode. +* - __half2int_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. +* - __half2int_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer using round-up mode. +* - __half2int_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2int_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns INT_MAX = \p 0x7FFFFFFF. 
+* - __half2int_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns INT_MIN = \p 0x80000000. +* - __half2int_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. 
+* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-to-nearest-even mode. +* - __half2short_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rn \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_rn \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-down mode. +* - __half2short_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_rd \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. 
+* - __half2short_rd \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer using round-up mode. +* - __half2short_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2short_ru \cuda_math_formula (x), x > 32767\end_cuda_math_formula returns SHRT_MAX = \p 0x7FFF. +* - __half2short_ru \cuda_math_formula (x), x < -32768\end_cuda_math_formula returns SHRT_MIN = \p 0x8000. +* - __half2short_ru(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. 
+* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-to-nearest-even mode. +* - __half2uint_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. 
+* - __half2uint_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-down mode. +* - __half2uint_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_rd(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer using round-up mode. +* - __half2uint_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2uint_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns UINT_MAX = \p 0xFFFFFFFF. +* - __half2uint_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2uint_ru(NaN) returns 0. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. 
+* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-to-nearest-even mode. +* - __half2ushort_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. +* - __half2ushort_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ushort_rn(NaN) returns 0. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer using round-down mode. +* - __half2ushort_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ushort_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF. 
+* - __half2ushort_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_rd(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer using round-up mode.
+* - __half2ushort_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0.
+* - __half2ushort_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns USHRT_MAX = \p 0xFFFF.
+* - __half2ushort_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0.
+* - __half2ushort_ru(NaN) returns 0.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-to-nearest-even mode. +* - __half2ull_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. 
+* - __half2ull_rn \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-down mode. +* - __half2ull_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_rd \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. +* - __half2ull_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer using round-up mode. +* - __half2ull_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ull_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns ULLONG_MAX = \p 0xFFFFFFFFFFFFFFFF. +* - __half2ull_ru \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns 0. 
+* - __half2ull_ru(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-to-nearest-even mode. +* - __half2ll_rn \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rn \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rn \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rn(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. 
+* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-down mode. +* - __half2ll_rd \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_rd \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_rd \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_rd(NaN) returns \p 0x8000000000000000. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of \p 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer using round-up mode. +* - __half2ll_ru \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 0. +* - __half2ll_ru \cuda_math_formula (+\infty)\end_cuda_math_formula returns LLONG_MAX = \p 0x7FFFFFFFFFFFFFFF. +* - __half2ll_ru \cuda_math_formula (-\infty)\end_cuda_math_formula returns LLONG_MIN = \p 0x8000000000000000. +* - __half2ll_ru(NaN) returns \p 0x8000000000000000. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. 
Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i); +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the largest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The truncated value. +* - htrunc( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - htrunc( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - htrunc(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The smallest integer value not less than \p h. +* - hceil( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hceil( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hceil(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. 
+* \param[in] h - half. Is only being read. +* +* \returns half +* - The largest integer value which is less than or equal to \p h. +* - hfloor( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hfloor( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hfloor(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* - hrint( +* \cuda_math_formula \pm 0 \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrint( +* \cuda_math_formula \pm \infty \end_cuda_math_formula +* ) returns +* \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrint(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the largest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* +* \see htrunc(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* +* \see hceil(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which is less than or equal to \p h. +* +* \see hfloor(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* +* \see hrint(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* \details Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* \param[in] a - half. Is only being read. +* +* \returns half2 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* \details Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines +* into one \p half2 number. +* +* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The low 16 bits of \p a and of \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from each of the two \p half2 inputs and +* combines into one \p half2 number. +* +* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns high 16 bits of \p half2 input. +* +* \details Returns high 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - The high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns low 16 bits of \p half2 input. +* +* \details Returns low 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - Returns \p half which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Checks if the input \p half number is infinite. 
+* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* - -1 if \p a is equal to negative infinity, +* - 1 if \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Combines two \p half numbers into one \p half2 number. +* +* \details Combines two input \p half number \p a and \p b into one \p half2 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half2 +* - The half2 with one half equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The half2 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating-point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. 
+* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? 
\p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +#if defined(_WIN32) +# define __CUDA_FP16_DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __CUDA_FP16_DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(_NVHPC_CUDA) +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release." +#else +#define __CUDA_FP16_WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
+#endif + +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __CUDA_FP16_DEPRECATED__(__CUDA_FP16_WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize); + +#undef __CUDA_FP16_WSB_DEPRECATION_MESSAGE +#undef __CUDA_FP16_DEPRECATED__ +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. +* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. 
If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. +* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. 
If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. 
+* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half2. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by \p var from the source thread ID as \p half2. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of \p var held by the thread whose ID is given by \p srcLane. 
+* If the \p width is less than \p warpSize, then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If \p srcLane is outside the range \p [0:width-1], +* the value returned corresponds to the value of \p var held by the \p srcLane modulo \p width (i.e. +* within the same subsection). \p width must have a value which is a power of 2; +* results are undefined if \p width is not a power of 2, or is a number greater than +* \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] srcLane - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting \p delta from the caller's lane ID. 
+* The value of \p var held by the resulting lane ID is returned: in effect, \p var is shifted up +* the warp by \p delta threads. If the \p width is less than \p warpSize, then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of \p width, so effectively the lower \p delta threads will be unchanged. +* \p width must have a value which is a power of 2; results are undefined if \p width is not a power of 2, +* or is a number greater than \p warpSize. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* +* \details Calculates a source thread ID by adding \p delta to the caller's thread ID. +* The value of \p var held by the resulting thread ID is returned: this has the effect +* of shifting \p var down the warp by \p delta threads. If the \p width is less than \p warpSize, then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. Similarly to the __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of \p width and the upper \p delta threads +* will remain unchanged. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] delta - unsigned int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p laneMask: +* the value of \p var held by the resulting thread ID is returned. If the \p width is less than \p warpSize, then each +* group of \p width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of \p var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* Threads may only read data from another thread which is actively participating in the +* \p __shfl_*sync() command. If the target thread is inactive, the retrieved value is undefined. +* \param[in] mask - unsigned int. Is only being read. +* - Indicates the threads participating in the call. +* - A bit, representing the thread's lane ID, must be set for each participating thread +* to ensure they are properly converged before the intrinsic is executed by the hardware. +* - Each calling thread must have its own bit set in the \p mask and all non-exited threads +* named in \p mask must execute the same intrinsic with the same \p mask, or the result is undefined. +* \param[in] var - half. Is only being read. +* \param[in] laneMask - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by \p var from the source thread ID as \p half. 
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */ + +#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. 
+* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. 
+* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. 
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns unsigned int
+* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to \p 0xFFFF for true, or \p 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. 
+* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* +* \see __habs(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the results to range [0.0, 1.0]. 
NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* +* \see __hneg(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The absolute value of \p a. +* - __habs \cuda_math_formula (\pm 0)\end_cuda_math_formula returns +0. +* - __habs \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - __habs(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. 
Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* - Negated input \p a. +* - __hneg \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \mp 0 \end_cuda_math_formula. +* - __hneg \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \mp \infty \end_cuda_math_formula. +* - __hneg(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison and returns boolean true +* if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true if both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true if both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* - true if argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. 
+* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. 
Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision: (a.x + I*a.y), (b.x + I*b.y), (c.x + I*c.y) +* and performs complex multiply-accumulate operation: a*b + c in a simple way: +* ((a.x*b.x + c.x) - a.y*b.y) + I*((a.x*b.y + c.y) + a.y*b.x) +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* - __half2 result = __hcmadd(a, b, c) is numerically in agreement with: +* - result.x = __hfma(-a.y, b.y, __hfma(a.x, b.x, c.x)) +* - result.y = __hfma( a.y, b.x, __hfma(a.x, b.y, c.y)) +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input: \cuda_math_formula \sqrt{a} \end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* - hsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+* - hsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input: \cuda_math_formula \frac{1}{\sqrt{a}}\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal square root of \p a. +* - hrsqrt \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrsqrt \cuda_math_formula (+\infty)\end_cuda_math_formula returns +0. +* - hrsqrt \cuda_math_formula (x), x < 0.0\end_cuda_math_formula returns NaN. +* - hrsqrt(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input: \cuda_math_formula \frac{1}{a}\end_cuda_math_formula in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* - hrcp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula \pm \infty \end_cuda_math_formula. +* - hrcp \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns \cuda_math_formula \pm 0 \end_cuda_math_formula. +* - hrcp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input: \cuda_math_formula \ln(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* - hlog \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog(1) returns +0. +* - hlog(x), x < 0 returns NaN. +* - hlog \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input: \cuda_math_formula \log_{2}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* - hlog2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog2(1) returns +0. +* - hlog2(x), x < 0 returns NaN. +* - hlog2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input: \cuda_math_formula \log_{10}(a)\end_cuda_math_formula in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* - hlog10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula -\infty \end_cuda_math_formula. +* - hlog10(1) returns +0. +* - hlog10(x), x < 0 returns NaN. +* - hlog10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hlog10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half natural exponential function of input: \cuda_math_formula e^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. +* - hexp \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates approximate \p half hyperbolic tangent function. +* +* \details Calculates approximate \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The approximate hyperbolic tangent function of \p a. +* - htanh_approx \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh_approx \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh_approx(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh_approx(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector approximate hyperbolic tangent function. +* +* \details Calculates \p half2 approximate hyperbolic tangent function of input vector \p a. +* This operation uses HW acceleration on devices of compute capability 7.5 and higher. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise approximate hyperbolic tangent function on vector \p a. +* +* \see htanh_approx(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half hyperbolic tangent function in +* round-to-nearest-even mode. 
+* +* \details Calculates \p half hyperbolic tangent function: \cuda_math_formula \tanh(a)\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The hyperbolic tangent function of \p a. +* - htanh \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - htanh \cuda_math_formula (\pm\infty)\end_cuda_math_formula returns \cuda_math_formula (\pm 1)\end_cuda_math_formula. +* - htanh(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htanh(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector hyperbolic tangent function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 hyperbolic tangent function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise hyperbolic tangent function on vector \p a. +* +* \see htanh(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half binary exponential function of input: \cuda_math_formula 2^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* - hexp2 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp2 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp2 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp2(NaN) returns NaN. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half decimal exponential function of input: \cuda_math_formula 10^{a}\end_cuda_math_formula in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. +* - hexp10 \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hexp10 \cuda_math_formula (-\infty)\end_cuda_math_formula returns +0. +* - hexp10 \cuda_math_formula (+\infty)\end_cuda_math_formula returns \cuda_math_formula +\infty \end_cuda_math_formula. +* - hexp10(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* - hcos \cuda_math_formula (\pm 0)\end_cuda_math_formula returns 1. +* - hcos \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hcos(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. 
+* +* \returns half +* - The sine of \p a. +* - hsin \cuda_math_formula (\pm 0)\end_cuda_math_formula returns \cuda_math_formula (\pm 0)\end_cuda_math_formula. +* - hsin \cuda_math_formula (\pm \infty)\end_cuda_math_formula returns NaN. +* - hsin(NaN) returns NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* +* \see hsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal square root on vector \p a. +* +* \see hrsqrt(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. 
+* +* \returns half2 +* - The elementwise reciprocal on vector \p a. +* +* \see hrcp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise natural logarithm on vector \p a. +* +* \see hlog(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary logarithm on vector \p a. +* +* \see hlog2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal logarithm on vector \p a. +* +* \see hlog10(__half) for further details. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise exponential function on vector \p a. +* +* \see hexp(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary exponential function on vector \p a. +* +* \see hexp2(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. 
+* +* \see hexp10(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* +* \see hcos(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. +* +* \see hsin(__half) for further details. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two \p __half elements; the entire \p __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. 
This operation is natively supported by devices of compute capability 6.x and higher, +* older devices use emulation path. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ +#endif /*defined(__CUDACC__) || defined(_NVHPC_CUDA)*/ + + +#endif /* defined(__cplusplus) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#endif + +// implicitly provided by NVRTC +#if !defined(__CUDACC_RTC__) +#include +#endif /* !defined(__CUDACC_RTC__) */ + +/* C++11 header for ::std::move. 
+ * In RTC mode, ::std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) +#include +#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */ + +/* C++ header for ::std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). + */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +#if (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) +#define __CUDA_FP16_INLINE__ +#define __CUDA_FP16_FORCEINLINE__ +#else +#define __CUDA_FP16_INLINE__ inline +#define __CUDA_FP16_FORCEINLINE__ __forceinline__ +#endif /* (defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3)))) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +#endif /* defined(__CUDACC__) */ + +// define __CUDA_FP16_CONSTEXPR__ in order to +// use constexpr where possible, with supporting C++ dialects +// undef after use +#if (defined __CPP_VERSION_AT_LEAST_11_FP16) +#define __CUDA_FP16_CONSTEXPR__ constexpr +#else +#define __CUDA_FP16_CONSTEXPR__ +#endif + +/** + * \ingroup 
CUDA_MATH_INTRINSIC_HALF + * \brief __half_raw data type + * \details Type allows static initialization of \p half until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half, + * and not a conversion from \p short to \p half. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(2) { + /** + * Storage field contains bits representation of the \p half floating-point number. + */ + unsigned short x; +} __half_raw; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2_raw data type + * \details Type allows static initialization of \p half2 until it becomes + * a built-in type. + * + * - Note: this initialization is as a bit-field representation of \p half2, + * and not a conversion from \p short2 to \p half2. + * Such representation will be deprecated in a future version of CUDA. + * + * - Note: this is visible to non-nvcc compilers, including C-only compilations + */ +typedef struct __CUDA_ALIGN__(4) { + /** + * Storage field contains bits of the lower \p half part. + */ + unsigned short x; + /** + * Storage field contains bits of the upper \p half part. + */ + unsigned short y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. 
This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +// forward-declaration of bfloat type to be used in converting constructor +struct __nv_bfloat16; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half data type + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment, arithmetic and comparison operators, and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * The objective here is to provide IEEE754-compliant implementation + * of \p binary16 type and arithmetic with limitations due to + * device HW not supporting floating-point exceptions. + */ +struct __CUDA_ALIGN__(2) __half { +protected: + /** + * Protected storage variable contains the bits of floating-point data. + */ + unsigned short __x; + +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /* Convert to/from __half_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half_raw. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half(const __half_raw &hr) : __x(hr.x) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p volatile \p __half_raw to \p volatile \p __half. + */ + __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half_raw operator with \p volatile input. + */ + __CUDA_HOSTDEVICE__ operator __half_raw() const volatile; +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p __nv_bfloat16 input using default round-to-nearest-even rounding mode. + * Need to include the header file \p cuda_bf16.h + */ + explicit __CUDA_HOSTDEVICE__ __half(const __nv_bfloat16 f); //forward declaration only, implemented in cuda_bf16.hpp +#endif /* #if defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + /* Construct from float/double */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p float operator. 
+ */ + __CUDA_HOSTDEVICE__ operator float() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p float input using default round-to-nearest-even rounding mode. + * + * \see __float2half(float) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const float f); + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast to \p __half assignment operator from \p double input using default round-to-nearest-even rounding mode. + * + * \see __double2half(double) for further details. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const double f); + +/* + * Implicit type conversions to/from integer types were only available to nvcc compilation. + * Introducing them for all compilers is a potentially breaking change that may affect + * overloads resolution and will require users to update their code. + * Define __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__ to opt-out. + */ +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p short integer input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p int input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p int input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ll2half_rn(static_cast(val)).__x; + } else { + __x = __int2half_rn(static_cast(val)).__x; + } + } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long val) { + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (default: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + __x = __ull2half_rn(static_cast(val)).__x; + } else { + __x = __uint2half_rn(static_cast(val)).__x; + } + } + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p long \p long input using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Construct \p __half from \p unsigned \p long \p long input using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported built-in types, matching all that are permitted with float */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p signed \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2char_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator signed char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p char data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uchar_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to an implementation defined \p char data type. + * Using round-toward-zero rounding mode. + * + * Detects signedness of the \p char type and proceeds accordingly, see + * further details in __half2char_rz(__half) and __half2uchar_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator char() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2short_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p short data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ushort_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned short() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2int_rz(__half) for further details. 
+ */ + __CUDA_HOSTDEVICE__ operator int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p int data type. + * Using round-toward-zero rounding mode. + * + * \see __half2uint_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned int() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p long type and proceeds accordingly, see + * further details in __half2int_rz(__half) and __half2ll_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long data type. + * Using round-toward-zero rounding mode. + * + * Detects size of the \p unsigned \p long type and proceeds + * accordingly, see further details in __half2uint_rz(__half) and __half2ull_rz(__half). + */ + __CUDA_HOSTDEVICE__ operator unsigned long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ll_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p unsigned \p long \p long data type. + * Using round-toward-zero rounding mode. + * + * \see __half2ull_rz(__half) for further details. + */ + __CUDA_HOSTDEVICE__ operator unsigned long long() const; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p short assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p short assignment operator, using default round-to-nearest-even rounding mode. 
+ */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p int assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Type cast from \p unsigned \p long \p long assignment operator, using default round-to-nearest-even rounding mode. + */ + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ + +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half subtraction operation. 
+ * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with addition operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with subtraction operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with multiplication operation. + * \see __hmul(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half compound assignment with division operation. + * \see __hdiv(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh); +/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix increment operation. 
+ * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half prefix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix increment operation. + * \see __hadd(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Performs \p half postfix decrement operation. + * \see __hsub(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored); + +/* Unary plus and inverse operators */ +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h); +/** + * \ingroup CUDA_MATH__HALF_ARITHMETIC + * Implements \p half unary minus operator. + * \see __hneg(__half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h); +/* Some basic comparison operations to make it look like a built-in */ +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered compare equal operation. + * \see __heq(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half unordered compare not-equal operation. + * \see __hneu(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-than compare operation. 
+ * \see __hgt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-than compare operation. + * \see __hlt(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered greater-or-equal compare operation. + * \see __hge(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh); +/** + * \ingroup CUDA_MATH__HALF_COMPARISON + * Performs \p half ordered less-or-equal compare operation. + * \see __hle(__half, __half) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh); +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief __half2 data type + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment, arithmetic and comparison + * operators, and type conversions. + * + * - NOTE: __half2 is visible to non-nvcc host compilers + */ +struct __CUDA_ALIGN__(4) __half2 { + /** + * Storage field holding lower \p __half part. + */ + __half x; + /** + * Storage field holding upper \p __half part. + */ + __half y; + + // All construct/copy/assign/move +public: + /** + * \ingroup CUDA_MATH__HALF_MISC + * \brief Constructor by default. + * \details Emtpy default constructor, result is uninitialized. 
+ */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move constructor, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Move assignment operator, available for \p C++11 and later dialects + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src); +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from two \p __half variables + */ + __CUDA_HOSTDEVICE__ __CUDA_FP16_CONSTEXPR__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy constructor + */ + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y = src.y; +) +} /** + * \ingroup CUDA_MATH__HALF_MISC + * Copy assignment operator + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src); + + /* Convert to/from __half2_raw */ + /** + * \ingroup CUDA_MATH__HALF_MISC + * Constructor from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) +} + /** + * \ingroup CUDA_MATH__HALF_MISC + * Assignment operator from \p __half2_raw + */ + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r); + /** + * \ingroup CUDA_MATH__HALF_MISC + * Conversion operator to \p __half2_raw + */ + __CUDA_HOSTDEVICE__ operator __half2_raw() const; +}; + +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +/** + * \ingroup 
CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half division operation. + * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with addition operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with subtraction operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with multiplication operation. + * \see __hmul2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half compound assignment with division operation. 
+ * \see __h2div(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half prefix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator--(__half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix increment operation. + * \see __hadd2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Performs packed \p half postfix decrement operation. + * \see __hsub2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary plus operator, returns input value. + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_ARITHMETIC + * Implements packed \p half unary minus operator. + * \see __hneg2(__half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered compare equal operation. + * \see __hbeq2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half unordered compare not-equal operation. 
+ * \see __hbneu2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-than compare operation. + * \see __hbgt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-than compare operation. + * \see __hblt2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered greater-or-equal compare operation. + * \see __hbge2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh); +/** + * \ingroup CUDA_MATH__HALF2_COMPARISON + * Performs packed \p half ordered less-or-equal compare operation. + * \see __hble2(__half2, __half2) + */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh); + +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ +#endif /* defined(__cplusplus) */ + +#if (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) + +/* Note the .hpp file is included to capture the "half" & "half2" built-in function definitions. For NVRTC, the built-in + function definitions are compiled at NVRTC library build-time and are available through the NVRTC built-ins library at + link time. 
+*/ +#include "cuda_fp16.hpp" +#endif /* (defined(__FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__) || \ + !(defined(__CUDACC_RTC__) && ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 3))))) */ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of the half-precision numbers format. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. + */ +typedef __half half; + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is meant to be the first-class or fundamental + * implementation of type for pairs of half-precision numbers. + * + * \details Should be implemented in the compiler in the future. + * Current implementation is a simple typedef to a respective + * user-level type with underscores. 
+ */ +typedef __half2 half2; +// for consistency with __nv_bfloat16 + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half __nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2 __nv_half2; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half_raw __nv_half_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p __nv_ prefixed alias + */ +typedef __half2_raw __nv_half2_raw; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half nv_half; +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF + * \brief This datatype is an \p nv_ prefixed alias + */ +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_FP16_INLINE__ +#undef __CUDA_FP16_FORCEINLINE__ +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp new file mode 100644 index 000000000..4259992df --- /dev/null +++ b/numba_cuda/numba/cuda/include/13/cuda_fp16.hpp @@ -0,0 +1,3483 @@ +/* +* Copyright 1993-2024 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. 
+* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. 
Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +#if !defined(__CUDA_FP16_H__) +#error "Do not include this file directly. Instead, include cuda_fp16.h." +#endif + +#if !defined(IF_DEVICE_OR_CUDACC) +#if defined(__CUDACC__) + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c) +#else + #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f) +#endif +#endif + +/* Macros for half & half2 binary arithmetic */ +#define __BINARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} /* while(0) */ + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines 
floating-point positive infinity value for the \p half data type + */ +#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines canonical NaN value for the \p half data type + */ +#define CUDART_NAN_FP16 __ushort_as_half((unsigned short)0x7FFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a minimum representable (denormalized) value for the \p half data type + */ +#define CUDART_MIN_DENORM_FP16 __ushort_as_half((unsigned short)0x0001U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a maximum representable value for the \p half data type + */ +#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a negative zero value for the \p half data type + */ +#define CUDART_NEG_ZERO_FP16 __ushort_as_half((unsigned short)0x8000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a positive zero value for the \p half data type + */ +#define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) +/** + * \ingroup CUDA_MATH_INTRINSIC_HALF_CONSTANTS + * \brief Defines a value of 1.0 for the \p half data type + */ +#define CUDART_ONE_FP16 __ushort_as_half((unsigned short)0x3C00U) + +#if !(defined __DOXYGEN_ONLY__) + +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const __half_raw &hr) { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ volatile __half &__half::operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator __half_raw() const volatile { __half_raw 
ret; ret.x = __x; return ret; } +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator float() const { return __half2float(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; } +#if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator signed char() const { return __half2char_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned char() const { return __half2uchar_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator char() const { + char value; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (((char)-1) < (char)0) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + value = static_cast(__half2char_rz(*this)); + } + else + { + value = static_cast(__half2uchar_rz(*this)); + } + return value; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator short() const { return __half2short_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned short() const { return __half2ushort_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator int() const { return __half2int_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned int() const { return __half2uint_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long() const { + long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ 
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(long) == sizeof(long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__half2ll_rz(*this)); + } + else + { + retval = static_cast(__half2int_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long() const { + unsigned long retval; + /* Suppress VS warning: warning C4127: conditional expression is constant */ +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (push) +#pragma warning (disable: 4127) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + if (sizeof(unsigned long) == sizeof(unsigned long long)) +#if defined(_MSC_VER) && !defined(__CUDA_ARCH__) +#pragma warning (pop) +#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */ + { + retval = static_cast(__half2ull_rz(*this)); + } + else + { + retval = static_cast(__half2uint_rz(*this)); + } + return retval; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator long long() const { return __half2ll_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half::operator unsigned long long() const { return __half2ull_rz(*this); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = 
__ll2half_rn(val).__x; return *this; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + +#endif /* #if !(defined __CUDA_FP16_DISABLE_IMPLICIT_INTEGER_CONVERTS_FOR_HOST_COMPILERS__) || (defined __CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a built-in */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator++(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h += one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator--(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h -= one; + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator+(const __half &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half operator-(const __half &h) { return __hneg(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &&src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = ::std::move(__HALF2_TO_CUI(src)); +, + this->x = src.x; + this->y = src.y; +) + return *this; +} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2 &src) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); +, + this->x = src.x; + this->y = 
src.y; +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2 &__half2::operator=(const __half2_raw &h2r) { +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); +, + __half_raw tr; + tr.x = h2r.x; + this->x = static_cast<__half>(tr); + tr.x = h2r.y; + this->y = static_cast<__half>(tr); +) + return *this; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half2::operator __half2_raw() const { + __half2_raw ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + ret.x = 0U; + ret.y = 0U; + __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); +, + ret.x = static_cast<__half_raw>(this->x).x; + ret.y = static_cast<__half_raw>(this->y).x; +) + return ret; +} +#if !defined(__CUDA_NO_HALF2_OPERATORS__) +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ 
__half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator++(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hadd2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator--(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hsub2(h, one); + return ret; +} +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator+(const __half2 &h) { return h; } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ __half2 operator-(const __half2 &h) { return __hneg2(h); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } +__CUDA_HOSTDEVICE__ __CUDA_FP16_FORCEINLINE__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF2_OPERATORS__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( 
pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + unsigned int u; + unsigned int result; +#if defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)::std::memcpy(&x, &f, sizeof(f)); +#endif + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU); + } else if (u > 0x477fefffU) { // Overflows + remainder = 0x80000000U; + result = (sign | 0x7bffU); + } else if (u >= 0x38800000U) { // Normal numbers + remainder = u << 19U; + u -= 0x38000000U; + result = (sign | (u >> 13U)); + } else if (u < 0x33000001U) { // +0/-0 + remainder = u; + result = sign; + } else { // Denormal numbers + const unsigned int exponent = u >> 23U; + const unsigned int shift = 0x7eU - exponent; + unsigned int mantissa = (u & 0x7fffffU); + mantissa |= 0x800000U; + remainder = mantissa << (32U - shift); + result = (sign | (mantissa >> shift)); + result &= 0x0000FFFFU; + } + return static_cast(result); +} +#endif /* #if !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a) +{ +IF_DEVICE_OR_CUDACC( + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a)); + return val; +, + __half result; + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. 
+ // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + unsigned long long int absa; + unsigned long long int ua; + (void)memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + result = __float2half(static_cast(a)); + } + else + { + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { // Here if |a| >= 2^(-14) + // add 42 to exponent bits + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + unsigned long long int aShiftRoundBits; + (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = 
__float2half(static_cast(aShiftRound - shifter)); + } + + return result; +, + __half result; + /* + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. + // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + */ + unsigned long long int absa; + unsigned long long int ua; + (void)::std::memcpy(&ua, &a, sizeof(a)); + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + /* + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + */ + result = __float2half(static_cast(a)); + } + else + { + /* + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. 
+ */ + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { + /* + // Here if |a| >= 2^(-14) + // add 42 to exponent bits + */ + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { + /* + // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + */ + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + (void)::std::memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + double aShiftRound = a + shifter; + + /* + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + */ + unsigned long long int aShiftRoundBits; + (void)::std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + (void)::std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + + result = __float2half(static_cast(aShiftRound - shifter)); + } + + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) 
&& ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a) +{ + __half val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +, + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low;\n" + " cvt.rn.f16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a)); +, + val = __half2(__float2half_rn(a), __float2half_rn(a)); +) + return val; +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) { + __half2 val; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n" + : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +, + asm("{.reg .f16 low,high;\n" + " cvt.rn.f16.f32 low, %1;\n" + " cvt.rn.f16.f32 high, %2;\n" + " mov.b32 %0, {low,high};}\n" : 
"=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +) + return val; +} + +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + val = __internal_device_float2_to_half2_rn(a,b); +, + val = __half2(__float2half_rn(a), __float2half_rn(b)); +) + return val; +} + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline float __internal_half2float(const unsigned short h) +{ + unsigned int sign = ((static_cast(h) >> 15U) & 1U); + unsigned int exponent = ((static_cast(h) >> 10U) & 0x1fU); + unsigned int mantissa = ((static_cast(h) & 0x3ffU) << 13U); + float f; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)::std::memcpy(&f, &u, sizeof(u)); +#endif + return f; +} +#endif /* !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a))); +, + val = __internal_half2float(static_cast<__half_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + 
val = __internal_half2float(static_cast<__half2_raw>(a).x); +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a) +{ + float val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +, + val = __internal_half2float(static_cast<__half2_raw>(a).y); +) + return val; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ signed char __half2char_rz(const __half h) +{ + signed char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.s8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + const unsigned char u = static_cast(tmp); + i = static_cast(u); +, + const float f = __half2float(h); + const signed char max_val = (signed char)0x7fU; + const signed char min_val = (signed char)0x80U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned char __half2uchar_rz(const __half h) +{ + unsigned char i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + unsigned int tmp; + asm("cvt.rzi.u8.f16 %0, %1;" : "=r"(tmp) : "h"(__HALF_TO_CUS(h))); + i = static_cast(tmp); +, + const float f = __half2float(h); + const unsigned char max_val = 0xffU; + const unsigned char min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = 
static_cast(f); + } +) + return i; +} + +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h) +{ + short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h) +{ + unsigned short int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h) +{ + int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else 
if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h) +{ + unsigned int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h) +{ + long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + const float f = __half2float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = min_val; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h) +{ + unsigned long long int i; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +, + 
const float f = __half2float(h); + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast(f); + } +) + return i; +} +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 make_half2(const __half x, const __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a) +{ + const __half2 val = __floats2half2_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a) +{ + float hi_float; + float lo_float; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a))); +, + lo_float = __internal_half2float(((__half2_raw)a).x); + hi_float = __internal_half2float(((__half2_raw)a).y); +) + return make_float2(lo_float, hi_float); +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ int __half2int_rn(const __half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(const __half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(const __half h) +{ + int i; + asm("cvt.rpi.s32.f16 
%0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rz(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rd(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_ru(const int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const 
short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rz(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rd(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_ru(const short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rz(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rd(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_ru(const unsigned int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u16 %0, %1;" : 
"=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast(i); + h = __float2half_rn(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rz(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rz(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rd(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_rd(f); +) + return h; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_ru(const long long int i) +{ + __half h; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +, + const float f = static_cast(i); + h = __float2half_ru(f); +) + return h; +} +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 
{low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.x; + val.y = b.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); +, + val.x = a.y; + val.y = b.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.x; +) + return ret; +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + const __half_raw araw = __half_raw(a); + if (araw.x == 0xFC00U) { + retval = -1; + } else if (araw.x == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.x; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.y; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); +, + ret = a.y; +) + return ret; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); +, + val.x = a; + val.y = b; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); +, + val.x = a; + val.y = a; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : 
"r"(__HALF2_TO_CUI(a))); +, + val.x = a.y; + val.y = a.x; +) + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half_as_short(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return static_cast(__HALF_TO_CUS(h)); +, + return static_cast(__half_raw(h).x); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __HALF_TO_CUS(h); +, + return __half_raw(h).x; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short_as_half(const short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = static_cast(i); + return h; +, + __half_raw hr; + hr.x = static_cast(i); + return __half(hr); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half h; + __HALF_TO_US(h) = i; + return h; +, + __half_raw hr; + hr.x = i; + return __half(hr);) +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __internal_device_hmax(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +__CUDA_FP16_DECL__ __half __internal_device_hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +) +} +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax(const __half a, 
const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmax(a, b); +, + __half maxval; + + maxval = (__hge(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(maxval)) + { + // if both inputs are NaN, return canonical NaN + maxval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmax(+0.0, -0.0) = +0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + maxval = (ra.x > rb.x) ? b : a; + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + return __internal_device_hmin(a, b); +, + __half minval; + + minval = (__hle(a, b) || __hisnan(b)) ? a : b; + + if (__hisnan(minval)) + { + // if both inputs are NaN, return canonical NaN + minval = CUDART_NAN_FP16; + } + else if (__heq(a, b)) + { + // hmin(+0.0, -0.0) = -0.0 + // unsigned compare 0x8000U > 0x0000U + __half_raw ra = __half_raw(a); + __half_raw rb = __half_raw(b); + minval = (ra.x > rb.x) ? 
a : b; + } + + return minval; +) +} + + +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max) +, + __half2 val; + val.x = __hmax(a.x, b.x); + val.y = __hmax(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min) +, + __half2 val; + val.x = __hmin(a.x, b.x); + val.y = __hmin(a.y, b.y); + return val; +) +} + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name, var, delta, c, mask) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, 
const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned int mask, const __half2 var, const int srcLane, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32, var, srcLane, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast(width)) << 8U; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned int mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32, var, delta, c, mask) +} +__CUDA_FP16_DECL__ 
__half2 __shfl_xor_sync(const unsigned int mask, const __half2 var, const int laneMask, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32, var, laneMask, c, mask) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) + +__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */ + +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned int mask, const __half var, const int srcLane, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_sync(mask, temp1, srcLane, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = 
__shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned int mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned int mask, const __half var, const int laneMask, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, laneMask, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} 
+__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void 
__stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */ +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ + +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.eq) +, + __half2_raw val; + val.x = __heq(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __heq(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ne) +, + __half2_raw val; + val.x = __hne(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hne(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.le) +, + __half2_raw val; + val.x = __hle(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hle(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ge) +, + __half2_raw val; + val.x = __hge(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hge(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.lt) +, + __half2_raw val; + val.x = __hlt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hlt(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gt) +, + __half2_raw val; + val.x = __hgt(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgt(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.equ) +, + __half2_raw val; + val.x = __hequ(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hequ(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.neu) +, + __half2_raw val; + val.x = __hneu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hneu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.leu) +, + __half2_raw val; + val.x = __hleu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hleu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.geu) +, + __half2_raw val; + val.x = __hgeu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgeu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.ltu) +, + __half2_raw val; + val.x = __hltu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hltu(a.y, b.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO(set.gtu) +, + __half2_raw val; + val.x = __hgtu(a.x, b.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hgtu(a.y, b.y) ? (unsigned short)0x3C00U : (unsigned short)0U; + return __half2(val); +) +} +#undef __COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half2 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* do */ {\ + unsigned val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __heq2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) +, + const unsigned short px = __heq(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __heq(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hne2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) +, + const unsigned short px = __hne(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hne(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hle2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.le) +, + const unsigned short px = __hle(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hle(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hge2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) +, + const unsigned short px = __hge(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hge(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hlt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) +, + const unsigned short px = __hlt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hlt(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgt2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) +, + const unsigned short px = __hgt(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgt(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hequ2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) +, + const unsigned short px = __hequ(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hequ(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hneu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) +, + const unsigned short px = __hneu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hneu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hleu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) +, + const unsigned short px = __hleu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hleu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgeu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) +, + const unsigned short px = __hgeu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgeu(a.y, b.y) ? 
(unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hltu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) +, + const unsigned short px = __hltu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hltu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __hgtu2_mask(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) +, + const unsigned short px = __hgtu(a.x, b.x) ? (unsigned short)0xFFFFU : (unsigned short)0U; + const unsigned short py = __hgtu(a.y, b.y) ? (unsigned short)0xFFFFU : (unsigned short)0U; + unsigned ur = (unsigned)py; + ur <<= (unsigned)16U; + ur |= (unsigned)px; + return ur; +) +} +#undef __COMPARISON_OP_HALF2_MACRO_MASK + +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + const unsigned mask = __heq2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hne2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hle2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hge2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hlt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + const unsigned 
mask = __hgt2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hequ2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hneu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hleu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgeu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hltu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + const unsigned mask = __hgtu2_mask(a, b); + return (mask == 0xFFFFFFFFU); +} +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +} /* while(0) */ +__CUDA_HOSTDEVICE_FP16_DECL__ bool __heq(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(eq) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hne(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ne) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb) && (!__hisnan(a)) && (!__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hle(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(le) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hge(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ge) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(lt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gt) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(equ) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa == fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + 
__COMPARISON_OP_HALF_MACRO(neu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa != fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(leu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa <= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(geu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa >= fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(ltu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa < fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __COMPARISON_OP_HALF_MACRO(gtu) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return (fa > fb) || (__hisnan(a)) || (__hisnan(b)); +) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add) +, + __half2 val; + val.x = __hadd(a.x, b.x); + val.y = __hadd(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub) +, + __half2 val; + val.x = __hsub(a.x, b.x); + val.y = __hsub(a.y, b.y); + return val; +) +} 
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul) +, + __half2 val; + val.x = __hmul(a.x, b.x); + val.y = __hmul(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.sat) +, + __half2 val; + val.x = __hadd_sat(a.x, b.x); + val.y = __hadd_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.sat) +, + __half2 val; + val.x = __hsub_sat(a.x, b.x); + val.y = __hsub_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.sat) +, + __half2 val; + val.x = __hmul_sat(a.x, b.x); + val.y = __hmul_sat(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(add.rn) +, + __half2 val; + val.x = __hadd_rn(a.x, b.x); + val.y = __hadd_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(sub.rn) +, + __half2 val; + val.x = __hsub_rn(a.x, b.x); + val.y = __hsub_rn(a.y, b.y); + return val; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF2_MACRO(mul.rn) +, + __half2 val; + val.x = __hmul_rn(a.x, b.x); + val.y = __hmul_rn(a.y, b.y); + return val; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const 
__half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.sat) +, + return __hmin(__hmax(__hadd(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.sat) +, + return 
__hmin(__hmax(__hsub(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.sat) +, + return __hmin(__hmax(__hmul(a, b), CUDART_ZERO_FP16), CUDART_ONE_FP16); +) +} + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(add.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa + fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(sub.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa - fb); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __BINARY_OP_HALF_MACRO(mul.rn) +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa * fb); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hdiv(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_IS_DEVICE, + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + abs = __habs(v); + if 
(__hlt(abs, den) && __hlt(__float2half(0.0f), abs)) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +, + const float fa = __half2float(a); + const float fb = __half2float(b); + return __float2half(fa / fb); +) +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#if defined(_NVHPC_CUDA) || defined(__CUDACC__) +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," 
__CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float __float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 
i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 
f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ + +__CUDA_FP16_DECL__ __half htanh(const __half a) { + float f = __half2float(a); + f = tanhf(f); + __half h = __float2half_rn(f); + return h; +} +__CUDA_FP16_DECL__ __half2 h2tanh(const __half2 a) { + float2 f = __half22float2(a); + f.x = tanhf(f.x); + f.y = tanhf(f.y); + __half2 h = __float22half2_rn(f); + return h; +} + +__CUDA_FP16_DECL__ __half htanh_approx(const __half a) { + __half r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + __half_raw hr = (__half_raw)a; + asm("tanh.approx.f16 %0, %0;" : "+h"(hr.x)); + r = (__half)hr; +, + r = htanh(a); +) + return r; +} +__CUDA_FP16_DECL__ __half2 h2tanh_approx(const __half2 a) { + __half2 res; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_75, + asm("tanh.approx.f16x2 %0, %1;" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a))); +, + res = h2tanh(a); +) + return res; +} + 
+__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " 
ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ 
__half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + 
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +#endif /* defined(_NVHPC_CUDA) || defined(__CUDACC__) */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); +, + __half2_raw val; + val.x = __hisnan(a.x) ? (unsigned short)0x3C00U : (unsigned short)0U; + val.y = __hisnan(a.y) ? 
(unsigned short)0x3C00U : (unsigned short)0U; + r = __half2(val); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ bool __hisnan(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +, + const __half_raw hr = static_cast<__half_raw>(a); + return ((hr.x & (unsigned short)0x7FFFU) > (unsigned short)0x7C00U); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __hneg(a.x); + r.y = __hneg(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hneg(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + const float fa = __half2float(a); + return __float2half(-fa); +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); +, + r.x = __habs(a.x); + r.y = __habs(a.y); +) + return r; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __habs(const __half a) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +, + __half_raw abs_a_raw = static_cast<__half_raw>(a); + abs_a_raw.x &= (unsigned short)0x7FFFU; + if (abs_a_raw.x > (unsigned short)0x7C00U) + { + // return canonical NaN + abs_a_raw.x = (unsigned short)0x7FFFU; + } + return static_cast<__half>(abs_a_raw); +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // 
(a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(max.NaN) +, + __half maxval; + if (__hisnan(a) || __hisnan(b)) + { + maxval = CUDART_NAN_FP16; + } + else + { + maxval = __hmax(a, b); + } + return maxval; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF_MACRO(min.NaN) +, + __half minval; + if (__hisnan(a) || __hisnan(b)) + { + minval = CUDART_NAN_FP16; + } + else + { + minval = __hmin(a, b); + } + return minval; +) +} + +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +, + return __hmax_nan(__hfma(a, b, c), CUDART_ZERO_FP16); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(max.NaN) +, + __half2 result = __hmax2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ 
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __BINARY_OP_HALF2_MACRO(min.NaN) +, + __half2 result = __hmin2(a, b); + if (__hisnan(a.x) || __hisnan(b.x)) + { + result.x = CUDART_NAN_FP16; + } + if (__hisnan(a.y) || __hisnan(b.y)) + { + result.y = CUDART_NAN_FP16; + } + return result; +) +} +#if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +, + __half2_raw hzero; + hzero.x = (unsigned short)0U; + hzero.y = (unsigned short)0U; + return __hmax2_nan(__hfma2(a, b, c), __half2(hzero)); +) +} +#endif /* defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) || defined(_NVHPC_CUDA) */ + +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { +NV_IF_ELSE_TARGET(NV_PROVIDES_SM_60, + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +, + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint; + unsigned int assumed; + do { + assumed = old; + __half2 new_val = __hadd2(val, *(__half2*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__half2*)&old; +) +} + +#if (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + 
__half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} +#endif /* (defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))) || defined(_NVHPC_CUDA) */ + +#undef __PTR +#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */ +#endif /* !(defined __DOXYGEN_ONLY__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI +#undef __CUDA_FP16_CONSTEXPR__ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ From e33c7cc6d57a751d93555cc5b1cf49362b8d0b31 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 00:33:22 -0700 Subject: [PATCH 39/56] update doc --- docs/source/reference/types.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 40112210a..303822eaf 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -80,7 +80,7 @@ Data Movement and Casts Construction of a single instance of a ``bfloat16`` object: -.. function:: numba.cuda.bf16.bfloat16(b) +.. function:: numba.cuda.types.bfloat16(b) Constructs a ``bfloat16`` from existing device `scalar`. 
Supported scalar types: From bc3b27de97e60cab7a0abf4df793ff0875c3caa9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 11:39:21 -0700 Subject: [PATCH 40/56] remove bfloattype custom impl --- numba_cuda/numba/cuda/models.py | 41 +-------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index d6e28b82f..335f504fa 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -1,4 +1,3 @@ -import struct import functools from llvmlite import ir @@ -46,46 +45,8 @@ def __init__(self, dmm, fe_type): register_model(CUDADispatcher)(models.OpaqueModel) -def _as_bfloat(value): - # Step 1: Reinterpret the input as u32 bits - u = struct.unpack("I", struct.pack("f", value))[0] - - # Step 2: Truncate (or round, we choose truncate) last 16 bits - trunc = u >> 16 - - # Step 3: Unpack them back to Python floats - f = struct.unpack("f", struct.pack("I", trunc))[0] - - return f - - -class BfloatType(ir.types._BaseFloatType): - """Brain-float type""" - - null = "0.0" - intrinsic_name = "bfloat" - - def __str__(self): - return "bfloat" - - def format_constant(self, value): - return ir.types._format_double(_as_bfloat(value)) - - -BfloatType._create_instance() - - @register_model(Bfloat16) class _model___nv_bfloat16(PrimitiveModel): def __init__(self, dmm, fe_type): - from numba.cuda.api import get_current_device - - major, minor = get_current_device().compute_capability - - # Blackwell device leverage latest nvvm (llvm 20+ dialect) which has - # bfloat type - if major >= 10: - be_type = BfloatType() - else: - be_type = ir.IntType(16) + be_type = ir.IntType(16) super(_model___nv_bfloat16, self).__init__(dmm, fe_type, be_type) From edea3c3e7022e8f6b32a748b3638801b8feb3380 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:23:47 -0700 Subject: [PATCH 41/56] add print tests --- .../numba/cuda/tests/cudapy/test_print.py | 18 
++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 0dbb3139b..a723885e0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -99,6 +99,20 @@ def print_too_many(r): cuda.synchronize() """ +print_bfloat16_usecase = """\ +from numba import cuda + +@cuda.jit +def print_bfloat16(): + # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits. + # printing this should not give any rounding error. + a = cuda.types.bfloat16(0.9375) + print(a, a, a) + +print_bfloat16[1, 1]() +cuda.synchronize() +""" + class TestPrint(CUDATestCase): # Note that in these tests we generally strip the output to avoid dealing @@ -145,6 +159,10 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) + def test_bfloat16(self): + output, _ = self.run_code(print_bfloat16_usecase) + self.assertEqual(output.strip(), "0.937500 0.937500 0.937500") + @skip_on_cudasim("cudasim can print unlimited output") def test_too_many_args(self): # Tests that we emit the format string and warn when there are more From 0fa0174d42320b011f3fd7b3c563042d48f80c5d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:30:50 -0700 Subject: [PATCH 42/56] add documentation for bfloat16 type --- numba_cuda/numba/cuda/types.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 17a4184d1..1f7786c37 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -43,7 +43,24 @@ class CUDADispatcher(types.Dispatcher): class Bfloat16(types.Number): """ - A bfloat16 type. + A bfloat16 type. Has 8 exponent bits and 7 significand bits. 
+ + Conversion rules: + Floats: + from: + fp32, fp64: UNSAFE + fp16: UNSAFE (loses precision) + to: + fp32, fp64: PROMOTE (same exponent, more mantissa) + fp16: UNSAFE (loses range) + + Integers: + from: + int8: SAFE + other int: All UNSAFE (bf16 cannot represent all integers in range) + to: UNSAFE (loses precision, round to zeros) + + All other conversions are not allowed. """ def __init__(self): @@ -59,8 +76,8 @@ def can_convert_from(self, typingctx, other): elif isinstance(other, types.Integer): if other.bitwidth == 8: return Conversion.safe - - return Conversion.unsafe + else: + return Conversion.unsafe def can_convert_to(self, typingctx, other): if isinstance(other, types.Float): @@ -71,8 +88,6 @@ def can_convert_to(self, typingctx, other): elif isinstance(other, types.Integer): return Conversion.unsafe - return Conversion.unsafe - def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) @@ -83,7 +98,9 @@ def cast_python_value(self, value): return ml_dtypes.bfloat16(value) except ImportError: - raise NotImplementedError + raise NotImplementedError( + "Please install ml_dtypes to use bfloat16 on host." + ) bfloat16 = Bfloat16() From e09ffc6dff6a935c4cd9325f1d4eab4120921787 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Aug 2025 12:34:58 -0700 Subject: [PATCH 43/56] update ci script and pyproject toml to make ml_dtypes a test time dependency --- ci/test_conda.sh | 1 + ci/test_conda_ctypes_binding.sh | 1 + pyproject.toml | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/test_conda.sh b/ci/test_conda.sh index 06c3c6e06..4aa989c81 100755 --- a/ci/test_conda.sh +++ b/ci/test_conda.sh @@ -30,6 +30,7 @@ rapids-mamba-retry create -n test \ pytest \ pytest-xdist \ cffi \ + ml_dtypes \ python=${RAPIDS_PY_VERSION} # Temporarily allow unbound variables for conda activation. 
diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh index a274c021e..4365eb0b7 100755 --- a/ci/test_conda_ctypes_binding.sh +++ b/ci/test_conda_ctypes_binding.sh @@ -24,6 +24,7 @@ rapids-mamba-retry create -n test \ pytest \ pytest-xdist \ cffi \ + ml_dtypes \ python=${RAPIDS_PY_VERSION} # Temporarily allow unbound variables for conda activation. diff --git a/pyproject.toml b/pyproject.toml index f3add3728..0c3fa9479 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ test = [ "pytest", "pytest-xdist", "filecheck", + "ml_dtypes", ] test-cu12 = [ "numba-cuda[cu12]", From 14b9fd5cfb5d61d87efe4a3ac453099955bf9a33 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 10:36:18 -0700 Subject: [PATCH 44/56] add manual implementation of bf16->fp64, litint->bf16 --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index a2af16d04..055ae2e00 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -1599,6 +1599,34 @@ def impl(context, builder, fromty, toty, value): _from___nv_bfloat16_to_bool__lower(shim_stream, shim_obj) +# C++ does not provide a conversion operator from bfloat16 to double, so we need to implement it manually. 
+def _from___nv_bfloat16_to_float64__lower(): + @lower_cast(_type___nv_bfloat16, float64) + def impl(context, builder, fromty, toty, value): + # Hand rolled bfloat16 -> float32 -> double conversion with zero-ext + bits32 = builder.zext(value, ir.IntType(32)) + shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) + f32 = builder.bitcast(shift, ir.FloatType()) + # printf("%f") expects a double; promote to f64 to match vararg expectation + f64 = builder.fpext(f32, ir.DoubleType()) + return f64 + + +_from___nv_bfloat16_to_float64__lower() + + +def _literalint_to_bf16_lower(): + @lower_cast(types.IntegerLiteral, _type___nv_bfloat16) + def impl(context, builder, fromty, toty, value): + f32 = context.cast(builder, value, fromty, float32) + i32 = builder.bitcast(f32, ir.IntType(32)) + i16 = builder.trunc(i32, ir.IntType(16)) + return i16 + + +_literalint_to_bf16_lower() + + # Typing for __nv_bfloat162 class _type_class___nv_bfloat162(Type): def __init__(self): From b7b70c6b8e6548df7eaa8ab4dc9be197378b5312 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 15:30:50 -0700 Subject: [PATCH 45/56] Maintain original overload resolution for all native operations --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 73 +++++++++++--------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 055ae2e00..8fc977cb0 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,6 +25,14 @@ from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast from numba.core.typing import signature +from numba.core.typing.old_builtins import ( + BinOp, + BinOpTrueDiv, + UnaryNegate, + UnaryPositive, + UnorderedCmpOp, + OrderedCmpOp, +) from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate from numba.core.typing.templates import Registry as TypingRegistry 
from numba.cuda import CUSource, declare_device @@ -1607,7 +1615,6 @@ def impl(context, builder, fromty, toty, value): bits32 = builder.zext(value, ir.IntType(32)) shift = builder.shl(bits32, ir.Constant(ir.IntType(32), 16)) f32 = builder.bitcast(shift, ir.FloatType()) - # printf("%f") expects a double; promote to f64 to match vararg expectation f64 = builder.fpext(f32, ir.DoubleType()) return f64 @@ -15962,8 +15969,8 @@ class _typing___half(ConcreteTemplate): @register_global(operator.add) -class _typing_operator_add(ConcreteTemplate): - cases = [ +class _typing_operator_add(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15974,8 +15981,8 @@ class _typing_operator_add(ConcreteTemplate): @register_global(operator.sub) -class _typing_operator_sub(ConcreteTemplate): - cases = [ +class _typing_operator_sub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15986,8 +15993,8 @@ class _typing_operator_sub(ConcreteTemplate): @register_global(operator.mul) -class _typing_operator_mul(ConcreteTemplate): - cases = [ +class _typing_operator_mul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -15998,8 +16005,8 @@ class _typing_operator_mul(ConcreteTemplate): @register_global(operator.truediv) -class _typing_operator_truediv(ConcreteTemplate): - cases = [ +class _typing_operator_truediv(BinOpTrueDiv): + cases = BinOpTrueDiv.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16010,8 +16017,8 @@ class _typing_operator_truediv(ConcreteTemplate): @register_global(operator.iadd) -class _typing_operator_iadd(ConcreteTemplate): - cases = [ +class _typing_operator_iadd(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16022,8 +16029,8 @@ class _typing_operator_iadd(ConcreteTemplate): 
@register_global(operator.isub) -class _typing_operator_isub(ConcreteTemplate): - cases = [ +class _typing_operator_isub(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16034,8 +16041,8 @@ class _typing_operator_isub(ConcreteTemplate): @register_global(operator.imul) -class _typing_operator_imul(ConcreteTemplate): - cases = [ +class _typing_operator_imul(BinOp): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16046,8 +16053,8 @@ class _typing_operator_imul(ConcreteTemplate): @register_global(operator.itruediv) -class _typing_operator_itruediv(ConcreteTemplate): - cases = [ +class _typing_operator_itruediv(BinOpTrueDiv): + cases = BinOp.cases + [ signature( _type___nv_bfloat16, _type___nv_bfloat16, _type___nv_bfloat16 ), @@ -16058,64 +16065,64 @@ class _typing_operator_itruediv(ConcreteTemplate): @register_global(operator.pos) -class _typing_operator_pos(ConcreteTemplate): - cases = [ +class _typing_operator_pos(UnaryPositive): + cases = UnaryPositive.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.neg) -class _typing_operator_neg(ConcreteTemplate): - cases = [ +class _typing_operator_neg(UnaryNegate): + cases = UnaryNegate.cases + [ signature(_type___nv_bfloat16, _type___nv_bfloat16), signature(_type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.eq) -class _typing_operator_eq(ConcreteTemplate): - cases = [ +class _typing_operator_eq(UnorderedCmpOp): + cases = UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ne) -class _typing_operator_ne(ConcreteTemplate): - cases = [ +class _typing_operator_ne(UnorderedCmpOp): + cases = UnorderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, 
_type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.gt) -class _typing_operator_gt(ConcreteTemplate): - cases = [ +class _typing_operator_gt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.lt) -class _typing_operator_lt(ConcreteTemplate): - cases = [ +class _typing_operator_lt(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.ge) -class _typing_operator_ge(ConcreteTemplate): - cases = [ +class _typing_operator_ge(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] @register_global(operator.le) -class _typing_operator_le(ConcreteTemplate): - cases = [ +class _typing_operator_le(OrderedCmpOp): + cases = OrderedCmpOp.cases + [ signature(bool_, _type___nv_bfloat16, _type___nv_bfloat16), signature(bool_, _type___nv_bfloat162, _type___nv_bfloat162), ] From 220287aed7eed13eb089c1bb10c80d7b66f484ea Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 15:32:03 -0700 Subject: [PATCH 46/56] remove operator function exposure (a numbast bug) --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 34 -------------------- 1 file changed, 34 deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 8fc977cb0..3b63e14ff 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -16385,40 +16385,6 @@ class _typing_operator_le(OrderedCmpOp): "h2cos", "h2sin", "atomicAdd", - "atomicAdd", - "operator+", - "operator-", - "operator*", - "operator/", - "operator+=", - "operator-=", - 
"operator*=", - "operator/=", - "operator+", - "operator-", - "operator==", - "operator!=", - "operator>", - "operator<", - "operator>=", - "operator<=", - "operator+", - "operator-", - "operator*", - "operator/", - "operator+=", - "operator-=", - "operator*=", - "operator/=", - "operator+", - "operator-", - "operator==", - "operator!=", - "operator>", - "operator<", - "operator>=", - "operator<=", - "__half", ] From 0f6683e3c689217a147f041ce1d1f2cf3ba18c3d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 22:17:40 -0700 Subject: [PATCH 47/56] remove ml_dtypes dependency in core --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 10 ---------- numba_cuda/numba/cuda/types.py | 10 ---------- 2 files changed, 20 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 8f0d4569b..0d6a5935e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -1,5 +1,3 @@ -import unittest -from importlib.util import find_spec import numpy as np from ml_dtypes import bfloat16 as mldtypes_bf16 @@ -572,14 +570,6 @@ def kernel(out): _bf16_ulp_distance(raw[4:], f8_expected), 2 ) - @unittest.skipIf( - find_spec("ml_dtypes") is None, - "ml_dtypes is required to use bfloat16 on host", - ) - def test_use_bfloat16_on_host(self): - x = bfloat16(3.0) - self.assertEqual(x, 3.0) - def _bf16_ulp_rank(bits_int16: np.ndarray) -> np.ndarray: """ diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 1f7786c37..f1b23a836 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -92,15 +92,5 @@ def unify(self, typingctx, other): if isinstance(other, (types.Float, types.Integer)): return typingctx.unify_pairs(self, other) - def cast_python_value(self, value): - try: - import ml_dtypes # noqa: F401 - - return ml_dtypes.bfloat16(value) - except ImportError: - raise 
NotImplementedError( - "Please install ml_dtypes to use bfloat16 on host." - ) - bfloat16 = Bfloat16() From c309442f40a09c9562a10d1363bdfa2013b992f9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 22:56:00 -0700 Subject: [PATCH 48/56] use builtin not old_builtin --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index 3b63e14ff..1edc43555 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -25,7 +25,7 @@ from numba.core.imputils import Registry as TargetRegistry from numba.core.imputils import lower_cast from numba.core.typing import signature -from numba.core.typing.old_builtins import ( +from numba.core.typing.builtins import ( BinOp, BinOpTrueDiv, UnaryNegate, From 6ed0e815ca441e6b3bac1a9b843ec4ad55b1b907 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:10:01 -0700 Subject: [PATCH 49/56] add ml_dtypes to simulator ci --- ci/test_simulator.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/test_simulator.sh b/ci/test_simulator.sh index 4bdaf8bef..bb85a8733 100755 --- a/ci/test_simulator.sh +++ b/ci/test_simulator.sh @@ -13,6 +13,7 @@ DEPENDENCIES=( "pytest" "pytest-xdist" "cffi" + "ml_dtypes" "python=${RAPIDS_PY_VERSION}" "numba-cuda" ) From c0250cb45be80afb212c1eb8523376c2d65dfed8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:25:03 -0700 Subject: [PATCH 50/56] fix sub-sub section headers --- docs/source/reference/types.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index f06caf5f4..9cc4c2bf2 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -370,7 +370,7 @@ Special value predicates: `_. 
Precision Conversion and Data Movement -************************************* +************************************** The following conversion intrinsics convert between ``bfloat16`` and other scalar types. Rounding-mode suffixes: @@ -381,7 +381,7 @@ scalar types. Rounding-mode suffixes: - ``_ru``: round-up (towards +∞) Floating-point conversions -========================== +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: numba.cuda.bf16.float32_to_bfloat16(x) @@ -403,13 +403,13 @@ Floating-point conversions Convert a ``float32`` to ``bfloat16`` using the specified rounding mode. Integer conversions -=================== +^^^^^^^^^^^^^^^^^^^^ Representative APIs for each integer width are listed below. All have rounding-mode variants ``_rn``, ``_rz``, ``_rd``, ``_ru``. int16 (signed 16-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int16_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int16_to_bfloat16_rz(x) @@ -426,7 +426,7 @@ int16 (signed 16-bit) Convert a ``bfloat16`` to ``int16`` with the selected rounding mode. uint16 (unsigned 16-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint16_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint16_to_bfloat16_rz(x) @@ -443,7 +443,7 @@ uint16 (unsigned 16-bit) Convert a ``bfloat16`` to ``uint16`` with the selected rounding mode. int32 (signed 32-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int32_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int32_to_bfloat16_rz(x) @@ -460,7 +460,7 @@ int32 (signed 32-bit) Convert a ``bfloat16`` to ``int32`` with the selected rounding mode. uint32 (unsigned 32-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint32_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint32_to_bfloat16_rz(x) @@ -477,7 +477,7 @@ uint32 (unsigned 32-bit) Convert a ``bfloat16`` to ``uint32`` with the selected rounding mode. 
int64 (signed 64-bit) ---------------------- +""""""""""""""""""""" .. function:: numba.cuda.bf16.int64_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.int64_to_bfloat16_rz(x) @@ -494,7 +494,7 @@ int64 (signed 64-bit) Convert a ``bfloat16`` to ``int64`` with the selected rounding mode. uint64 (unsigned 64-bit) ------------------------- +""""""""""""""""""""""""" .. function:: numba.cuda.bf16.uint64_to_bfloat16_rn(x) .. function:: numba.cuda.bf16.uint64_to_bfloat16_rz(x) @@ -511,7 +511,7 @@ uint64 (unsigned 64-bit) Convert a ``bfloat16`` to ``uint64`` with the selected rounding mode. 8-bit conversions -================= +^^^^^^^^^^^^^^^^^^ .. function:: numba.cuda.bf16.bfloat16_to_int8_rz(x) @@ -522,7 +522,7 @@ uint64 (unsigned 64-bit) Convert a ``bfloat16`` to ``uint8`` with round-towards-zero. Bit Reinterpret Casts -********************* +^^^^^^^^^^^^^^^^^^^^^ These APIs reinterpret bits without numeric conversion: From 13d7cb7838c5d72521aa499cba4ed7eee4022e74 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Aug 2025 23:44:16 -0700 Subject: [PATCH 51/56] skip simulator for roundtrip --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index 4e9b3dd30..f8f47644b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -344,6 +344,8 @@ def kernel(out): self.assertAlmostEqual(out[3], 2.0, delta=1e-3) def test_bfloat16_as_bitcast(self): + self.skip_unsupported() + @cuda.jit def roundtrip_kernel(test_val, i2, u2): i2[0] = int16_as_bfloat16(bfloat16_as_int16(test_val)) From d41c67cf0eabe491c6fc1ea80f53bdb20c55e61d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 09:50:05 -0700 Subject: [PATCH 52/56] use numba typing templates --- numba_cuda/numba/cuda/_internal/cuda_bf16.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/numba_cuda/numba/cuda/_internal/cuda_bf16.py b/numba_cuda/numba/cuda/_internal/cuda_bf16.py index c7a8ed934..33beb2b5a 100644 --- a/numba_cuda/numba/cuda/_internal/cuda_bf16.py +++ b/numba_cuda/numba/cuda/_internal/cuda_bf16.py @@ -36,8 +36,8 @@ UnorderedCmpOp, OrderedCmpOp, ) -from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate -from numba.core.typing.templates import Registry as TypingRegistry +from numba.cuda.typing.templates import AttributeTemplate, ConcreteTemplate +from numba.cuda.typing.templates import Registry as TypingRegistry from numba.cuda import CUSource, declare_device from numba.cuda.vector_types import vector_types from numba.extending import as_numba_type From 7d77ada2314e6ac206cba1c2c07783ac6c9277ae Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 10:54:56 -0700 Subject: [PATCH 53/56] skip lto test without nvjitlink --- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py index 31d1d13e4..7d4343e35 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py @@ -5,6 +5,7 @@ from numba.cuda.testing import unittest, CUDATestCase import numpy as np import operator +from numba.cuda.testing import skip_if_nvjitlink_missing from numba import ( config, @@ -293,6 +294,7 @@ def kernel(arr): np.testing.assert_allclose(arr, [3], atol=1e-2) + @skip_if_nvjitlink_missing("LTO is not supported without nvjitlink.") def test_bf16_intrinsics_used_in_lto(self): self.skip_unsupported() From 212f4f0139f55279240eabb837153a210a7400f8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 10:58:13 -0700 Subject: [PATCH 54/56] skip cuda sim for bfloat16 tests --- .../numba/cuda/tests/cudapy/test_bfloat16.py | 191 +++++++++--------- 
1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py index f8f47644b..95e5fe140 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py @@ -14,100 +14,105 @@ uint16, uint32, uint64, + config, ) -from numba.cuda.bf16 import ( - bfloat16, - habs, - hadd, - hsub, - hmul, - hadd_rn, - hsub_rn, - hmul_rn, - hdiv, - hadd_sat, - hsub_sat, - hmul_sat, - hfma, - hfma_sat, - hneg, - hfma_relu, - # Comparison intrinsics - heq, - hne, - hge, - hgt, - hle, - hlt, - hmax, - hmin, - hmax_nan, - hmin_nan, - hisnan, - hisinf, - # Conversion intrinsics (NumPy-style names) - bfloat16_to_int8_rz, - bfloat16_to_uint8_rz, - int16_to_bfloat16_rn, - int16_to_bfloat16_rz, - int16_to_bfloat16_rd, - int16_to_bfloat16_ru, - bfloat16_to_int16_rn, - bfloat16_to_int16_rz, - bfloat16_to_int16_rd, - bfloat16_to_int16_ru, - uint16_to_bfloat16_rn, - uint16_to_bfloat16_rz, - uint16_to_bfloat16_rd, - uint16_to_bfloat16_ru, - bfloat16_to_uint16_rn, - bfloat16_to_uint16_rz, - bfloat16_to_uint16_rd, - bfloat16_to_uint16_ru, - int32_to_bfloat16_rn, - int32_to_bfloat16_rz, - int32_to_bfloat16_rd, - int32_to_bfloat16_ru, - bfloat16_to_int32_rn, - bfloat16_to_int32_rz, - bfloat16_to_int32_rd, - bfloat16_to_int32_ru, - uint32_to_bfloat16_rn, - uint32_to_bfloat16_rz, - uint32_to_bfloat16_rd, - uint32_to_bfloat16_ru, - bfloat16_to_uint32_rn, - bfloat16_to_uint32_rz, - bfloat16_to_uint32_rd, - bfloat16_to_uint32_ru, - bfloat16_to_int64_rn, - bfloat16_to_int64_rz, - bfloat16_to_int64_rd, - bfloat16_to_int64_ru, - int64_to_bfloat16_rn, - int64_to_bfloat16_rz, - int64_to_bfloat16_rd, - int64_to_bfloat16_ru, - bfloat16_to_uint64_rn, - bfloat16_to_uint64_rz, - bfloat16_to_uint64_rd, - bfloat16_to_uint64_ru, - uint64_to_bfloat16_rn, - uint64_to_bfloat16_rz, - uint64_to_bfloat16_rd, - uint64_to_bfloat16_ru, - bfloat16_as_int16, - 
int16_as_bfloat16, - bfloat16_as_uint16, - uint16_as_bfloat16, - bfloat16_to_float32, - float32_to_bfloat16, - float64_to_bfloat16, - float32_to_bfloat16_rn, - float32_to_bfloat16_rz, - float32_to_bfloat16_rd, - float32_to_bfloat16_ru, -) + + +if not config.ENABLE_CUDASIM: + from numba.cuda.bf16 import ( + bfloat16, + habs, + hadd, + hsub, + hmul, + hadd_rn, + hsub_rn, + hmul_rn, + hdiv, + hadd_sat, + hsub_sat, + hmul_sat, + hfma, + hfma_sat, + hneg, + hfma_relu, + # Comparison intrinsics + heq, + hne, + hge, + hgt, + hle, + hlt, + hmax, + hmin, + hmax_nan, + hmin_nan, + hisnan, + hisinf, + # Conversion intrinsics (NumPy-style names) + bfloat16_to_int8_rz, + bfloat16_to_uint8_rz, + int16_to_bfloat16_rn, + int16_to_bfloat16_rz, + int16_to_bfloat16_rd, + int16_to_bfloat16_ru, + bfloat16_to_int16_rn, + bfloat16_to_int16_rz, + bfloat16_to_int16_rd, + bfloat16_to_int16_ru, + uint16_to_bfloat16_rn, + uint16_to_bfloat16_rz, + uint16_to_bfloat16_rd, + uint16_to_bfloat16_ru, + bfloat16_to_uint16_rn, + bfloat16_to_uint16_rz, + bfloat16_to_uint16_rd, + bfloat16_to_uint16_ru, + int32_to_bfloat16_rn, + int32_to_bfloat16_rz, + int32_to_bfloat16_rd, + int32_to_bfloat16_ru, + bfloat16_to_int32_rn, + bfloat16_to_int32_rz, + bfloat16_to_int32_rd, + bfloat16_to_int32_ru, + uint32_to_bfloat16_rn, + uint32_to_bfloat16_rz, + uint32_to_bfloat16_rd, + uint32_to_bfloat16_ru, + bfloat16_to_uint32_rn, + bfloat16_to_uint32_rz, + bfloat16_to_uint32_rd, + bfloat16_to_uint32_ru, + bfloat16_to_int64_rn, + bfloat16_to_int64_rz, + bfloat16_to_int64_rd, + bfloat16_to_int64_ru, + int64_to_bfloat16_rn, + int64_to_bfloat16_rz, + int64_to_bfloat16_rd, + int64_to_bfloat16_ru, + bfloat16_to_uint64_rn, + bfloat16_to_uint64_rz, + bfloat16_to_uint64_rd, + bfloat16_to_uint64_ru, + uint64_to_bfloat16_rn, + uint64_to_bfloat16_rz, + uint64_to_bfloat16_rd, + uint64_to_bfloat16_ru, + bfloat16_as_int16, + int16_as_bfloat16, + bfloat16_as_uint16, + uint16_as_bfloat16, + bfloat16_to_float32, + float32_to_bfloat16, + 
float64_to_bfloat16, + float32_to_bfloat16_rn, + float32_to_bfloat16_rz, + float32_to_bfloat16_rd, + float32_to_bfloat16_ru, + ) + from numba.cuda.testing import CUDATestCase import math From ac576bed3602129be944607d20eafee17792df78 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 13:04:51 -0700 Subject: [PATCH 55/56] update simulator tests --- numba_cuda/numba/cuda/tests/cudapy/test_print.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 15e8b7ebf..f6df2c1e3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -103,7 +103,11 @@ def print_too_many(r): """ print_bfloat16_usecase = """\ -from numba import cuda +from numba import cuda, config + +if config.ENABLE_CUDASIM: + print("bfloat16 on host is not yet supported.") + exit(0) @cuda.jit def print_bfloat16(): From f3946db1dc4b7df4a3490209c7dd92f692ad69e2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Aug 2025 22:19:46 -0700 Subject: [PATCH 56/56] skip simulator test on host --- numba_cuda/numba/cuda/tests/cudapy/test_print.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index f6df2c1e3..ff27fd169 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -105,10 +105,6 @@ def print_too_many(r): print_bfloat16_usecase = """\ from numba import cuda, config -if config.ENABLE_CUDASIM: - print("bfloat16 on host is not yet supported.") - exit(0) - @cuda.jit def print_bfloat16(): # 0.9375 is a dyadic rational, it's integer significand can expand within 7 digits. 
@@ -166,6 +162,7 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) + @skip_on_cudasim("bfloat16 on host is not yet supported.") def test_bfloat16(self): output, _ = self.run_code(print_bfloat16_usecase) self.assertEqual(output.strip(), "0.937500 0.937500 0.937500")