NVIDIA · brandon-b-miller · Sep 29, 2025 · Sep 29, 2025
diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py
@@ -17,6 +17,7 @@
 from typing import Iterable, Union
 from io import StringIO
 import unittest
+import numpy as np
 
 if PYVERSION >= (3, 10):
     from filecheck.matcher import Matcher
@@ -44,6 +45,8 @@ class CUDATestCase(TestCase):
     matches FileCheck checks, and is not specific to CUDADispatcher.
     """
 
+    FLOAT16_RTOL = np.finfo(np.float16).eps  # rtol for float16 comparisons
+
     def setUp(self):
         self._low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS
         self._warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY

diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py b/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py
@@ -629,7 +629,7 @@ def test_hadd(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 + arg2)
+        np.testing.assert_allclose(ary[0], arg1 + arg2, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hadd_scalar(self):
@@ -639,7 +639,7 @@ def test_hadd_scalar(self):
         arg2 = np.float16(3.0)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 + arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -657,7 +657,9 @@ def test_hfma(self):
         arg2 = np.array([3.0], dtype=np.float16)
         arg3 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2, arg3)
-        np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3)
+        np.testing.assert_allclose(
+            ary[0], arg1 * arg2 + arg3, rtol=self.FLOAT16_RTOL
+        )
 
     @skip_unless_cc_53
     def test_hfma_scalar(self):
@@ -668,7 +670,7 @@ def test_hfma_scalar(self):
         arg3 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2, arg3)
         ref = arg1 * arg2 + arg3
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -687,7 +689,7 @@ def test_hsub(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 - arg2)
+        np.testing.assert_allclose(ary[0], arg1 - arg2, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hsub_scalar(self):
@@ -697,7 +699,7 @@ def test_hsub_scalar(self):
         arg2 = np.float16(1.57)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 - arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -714,7 +716,7 @@ def test_hmul(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 * arg2)
+        np.testing.assert_allclose(ary[0], arg1 * arg2, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hmul_scalar(self):
@@ -724,7 +726,7 @@ def test_hmul_scalar(self):
         arg2 = np.float16(1.57)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 * arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -743,7 +745,7 @@ def test_hdiv_scalar(self):
 
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 / arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hdiv(self):
@@ -754,15 +756,15 @@ def test_hdiv(self):
 
         compiled.forall(ary.size)(ary, arry1, arry2)
         ref = arry1 / arry2
-        np.testing.assert_allclose(ary, ref)
+        np.testing.assert_allclose(ary, ref, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hneg(self):
         compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg)
         ary = np.zeros(1, dtype=np.float16)
         arg1 = np.array([3.0], dtype=np.float16)
         compiled[1, 1](ary, arg1)
-        np.testing.assert_allclose(ary[0], -arg1)
+        np.testing.assert_allclose(ary[0], -arg1, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hneg_scalar(self):
@@ -771,7 +773,7 @@ def test_hneg_scalar(self):
         arg1 = np.float16(3.1415926)
         compiled[1, 1](ary, arg1)
         ref = -arg1
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -787,7 +789,7 @@ def test_habs(self):
         ary = np.zeros(1, dtype=np.float16)
         arg1 = np.array([-3.0], dtype=np.float16)
         compiled[1, 1](ary, arg1)
-        np.testing.assert_allclose(ary[0], abs(arg1))
+        np.testing.assert_allclose(ary[0], abs(arg1), rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_habs_scalar(self):
@@ -796,7 +798,7 @@ def test_habs_scalar(self):
         arg1 = np.float16(-3.1415926)
         compiled[1, 1](ary, arg1)
         ref = abs(arg1)
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -849,15 +851,15 @@ def test_fp16_intrinsics_common(self):
                 kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                 kernel[1, N](r, x)
                 expected = fn(x, dtype=np.float16)
-                np.testing.assert_allclose(r, expected)
+                np.testing.assert_allclose(r, expected, rtol=self.FLOAT16_RTOL)
 
         x2 = np.random.randint(1, 10, size=N).astype(np.float16)
         for kernel, fn in zip(exp_kernels, expected_exp_functions):
             with self.subTest(fn=fn):
                 kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                 kernel[1, N](r, x2)
                 expected = fn(x2, dtype=np.float16)
-                np.testing.assert_allclose(r, expected)
+                np.testing.assert_allclose(r, expected, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hexp10(self):
@@ -876,7 +878,7 @@ def hexp10_vectors(r, x):
 
         # Run the kernel
         hexp10_vectors[1, N](r, x)
-        np.testing.assert_allclose(r, 10**x)
+        np.testing.assert_allclose(r, 10**x, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_fp16_comparison(self):
@@ -948,10 +950,10 @@ def test_hmax(self):
         arg1 = np.float16(3.0)
         arg2 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg2)
+        np.testing.assert_allclose(ary[0], arg2, rtol=self.FLOAT16_RTOL)
         arg1 = np.float16(5.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1)
+        np.testing.assert_allclose(ary[0], arg1, rtol=self.FLOAT16_RTOL)
 
     @skip_unless_cc_53
     def test_hmin(self):
@@ -960,10 +962,10 @@ def test_hmin(self):
         arg1 = np.float16(3.0)
         arg2 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1)
+        np.testing.assert_allclose(ary[0], arg1, rtol=self.FLOAT16_RTOL)
         arg1 = np.float16(5.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg2)
+        np.testing.assert_allclose(ary[0], arg2, rtol=self.FLOAT16_RTOL)
 
     def test_cbrt_f32(self):
         compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt)