diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 591e4a2af..000000000 --- a/.flake8 +++ /dev/null @@ -1,52 +0,0 @@ -[flake8] -ignore = - # Extra space in brackets - E20, - # Multiple spaces around "," - E231,E241, - # Comments - E26, - # Assigning lambda expression - E731, - # Ambiguous variable names - E741, - # line break before binary operator - W503, - # line break after binary operator - W504, -max-line-length = 80 - -exclude = - __pycache__ - .git - *.pyc - *~ - *.o - *.so - *.cpp - *.c - *.h - -per-file-ignores = - # Slightly long line in the standard version file - numba_cuda/_version.py: E501 - # "Unused" imports / potentially undefined names in init files - numba_cuda/numba/cuda/__init__.py:F401,F403,F405 - numba_cuda/numba/cuda/simulator/__init__.py:F401,F403 - numba_cuda/numba/cuda/simulator/cudadrv/__init__.py:F401 - # Ignore star imports, unused imports, and "may be defined by star imports" - # errors in device_init because its purpose is to bring together a lot of - # the public API to be star-imported in numba.cuda.__init__ - numba_cuda/numba/cuda/device_init.py:F401,F403,F405 - # libdevice.py is an autogenerated file containing stubs for all the device - # functions. Some of the lines in docstrings are a little over-long, as they - # contain the URLs of the reference pages in the online libdevice - # documentation. 
- numba_cuda/numba/cuda/libdevice.py:E501 - # Ignore too-long lines in the doc examples, prioritising readability - # in the docs over line length in the example source (especially given that - # the test code is already indented by 8 spaces) - numba_cuda/numba/cuda/tests/doc_examples/test_random.py:E501 - numba_cuda/numba/cuda/tests/doc_examples/test_cg.py:E501 - numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py:E501 - numba_cuda/numba/tests/doc_examples/test_interval_example.py:E501 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..e1edafb25 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Migrate code style to ruff +06b62024f77bb92b585315fe61b9ba15e0885d71 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0a114cd32..478cd1ef5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,23 @@ repos: -- repo: https://github.com/PyCQA/flake8 - rev: 7.1.0 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 # Use the latest version or a specific tag hooks: - - id: flake8 + - id: check-added-large-files + - id: check-ast + - id: check-json + - id: check-merge-conflict + - id: check-toml + - id: check-yaml + exclude: ^conda/recipes/numba-cuda/meta.yaml + - id: debug-statements + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: trailing-whitespace + - id: mixed-line-ending + args: ['--fix=lf'] +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.2 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/docs/make.bat b/docs/make.bat index 9c55982a2..3629950d6 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,39 +1,39 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -if "%SPHINXOPTS%" == "" ( - set SPHINXOPTS=-W -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +if "%SPHINXOPTS%" == "" ( + set SPHINXOPTS=-W +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py index dae885740..23c78665b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,39 +6,40 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'Numba CUDA' -copyright = '2012-2024 Anaconda Inc. 2024, NVIDIA Corporation.' -author = 'NVIDIA Corporation' +project = "Numba CUDA" +copyright = "2012-2024 Anaconda Inc. 2024, NVIDIA Corporation." 
+author = "NVIDIA Corporation" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ['numpydoc', 'sphinx.ext.intersphinx', 'sphinx.ext.autodoc'] +extensions = ["numpydoc", "sphinx.ext.intersphinx", "sphinx.ext.autodoc"] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None), - 'numpy': ('https://numpy.org/doc/stable/', None), - 'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None), - 'numba': ('https://numba.readthedocs.io/en/latest/', None), + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "llvmlite": ("https://llvmlite.readthedocs.io/en/latest/", None), + "numba": ("https://numba.readthedocs.io/en/latest/", None), } # To prevent autosummary warnings numpydoc_show_class_members = False -autodoc_typehints = 'none' +autodoc_typehints = "none" # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output try: import nvidia_sphinx_theme # noqa: F401 + html_theme = "nvidia_sphinx_theme" except ImportError: html_theme = "sphinx_rtd_theme" -html_static_path = ['_static'] +html_static_path = ["_static"] html_favicon = "_static/numba-green-icon-rgb.svg" html_show_sphinx = False diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 31197241e..7ed079f1c 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -19,7 +19,7 @@ this is the recommended way to instantiate vector types. For convenience, users adapting existing kernels from CUDA C/C++ to Python may use aliases consistent with the C/C++ namings. 
For example, ``float3`` aliases ``float32x3``, -``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc. +``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc. Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly with their constructor. For example, to construct a ``float32x3``: @@ -44,7 +44,7 @@ vector type. For example, all of the following constructions are valid: # Construct a 4-component vector with 2 2-component vectors u4 = uint32x4(u2, u2) -The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields +The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields ``x``, ``y``, ``z``, and ``w`` respectively. The components are immutable after construction in the present version of Numba; it is expected that support for mutating vector components will be added in a future release. diff --git a/docs/source/user/cooperative_groups.rst b/docs/source/user/cooperative_groups.rst index a08fa3784..0ce70614d 100644 --- a/docs/source/user/cooperative_groups.rst +++ b/docs/source/user/cooperative_groups.rst @@ -50,7 +50,7 @@ overloads: This can be used to ensure that the kernel is launched with no more than the maximum number of blocks. Exceeding the maximum number of blocks for the cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`` -error. +error. Applications and Example diff --git a/docs/source/user/device-management.rst b/docs/source/user/device-management.rst index 8f9beb4db..12878961d 100644 --- a/docs/source/user/device-management.rst +++ b/docs/source/user/device-management.rst @@ -89,4 +89,3 @@ For example, to obtain the UUID of the current device: dev = cuda.current_context().device # prints e.g. 
"GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643" print(dev.uuid) - diff --git a/docs/source/user/examples.rst b/docs/source/user/examples.rst index 8adcf313b..5425f4bb1 100644 --- a/docs/source/user/examples.rst +++ b/docs/source/user/examples.rst @@ -101,7 +101,7 @@ propagates through an object over time. It works by discretizing the problem in 1. The domain is partitioned into a mesh of points that each have an individual temperature. 2. Time is partitioned into discrete intervals that are advanced forward sequentially. -Then, the following assumption is applied: The temperature of a point after some interval +Then, the following assumption is applied: The temperature of a point after some interval has passed is some weighted average of the temperature of the points that are directly adjacent to it. Intuitively, if all the points in the domain are very hot and a single point in the middle is very cold, as time passes, the hot points will cause @@ -109,9 +109,9 @@ the cold one to heat up and the cold point will cause the surrounding hot pieces slightly. Simply put, the heat spreads throughout the object. We can implement this simulation using a Numba kernel. Let's start simple by assuming -we have a one dimensional object which we'll represent with an array of values. The position +we have a one dimensional object which we'll represent with an array of values. The position of the element in the array is the position of a point within the object, and the value -of the element represents the temperature. +of the element represents the temperature. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py :language: python @@ -138,7 +138,7 @@ The initial state of the problem can be visualized as: In our kernel each thread will be responsible for managing the temperature update for a single element in a loop over the desired number of timesteps. The kernel is below. 
Note the use of cooperative group -synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See +synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See :func:`numba.cuda.cg.this_grid() ` for details. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py @@ -237,15 +237,15 @@ A common problem in business analytics is that of grouping the activity of users sessions, called "sessionization". The idea is that users generally traverse through a website and perform various actions (clicking something, filling out a form, etc.) in discrete groups. Perhaps a customer spends some time shopping for an item in the morning and then again at night - often the business is interested in -treating these periods as separate interactions with their service, and this creates the problem of +treating these periods as separate interactions with their service, and this creates the problem of programmatically splitting up activity in some agreed-upon way. -Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data -containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let -``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's +Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data +containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let +``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's only one type of action, so all there is to know is when it happened. -Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique +Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique session. 
We'll define the boundary between sessions as when there has been at least one hour between clicks. @@ -256,7 +256,7 @@ session. We'll define the boundary between sessions as when there has been at le :end-before: ex_sessionize.import.end :dedent: 8 :linenos: - + Here is a solution using Numba: .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py @@ -285,8 +285,8 @@ and a similar pattern is seen throughout. JIT Function CPU-GPU Compatibility ================================== -This example demonstrates how ``numba.jit`` can be used to jit compile a function for the CPU, while at the same time making -it available for use inside CUDA kernels. This can be very useful for users that are migrating workflows from CPU to GPU as +This example demonstrates how ``numba.jit`` can be used to jit compile a function for the CPU, while at the same time making +it available for use inside CUDA kernels. This can be very useful for users that are migrating workflows from CPU to GPU as they can directly reuse potential business logic with fewer code changes. Take the following example function: @@ -309,7 +309,7 @@ The function ``business_logic`` can be run standalone in compiled form on the CP :dedent: 8 :linenos: -It can also be directly reused threadwise inside a GPU kernel. For example one may +It can also be directly reused threadwise inside a GPU kernel. For example one may generate some vectors to represent ``x``, ``y``, and ``z``: .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py @@ -345,12 +345,12 @@ This kernel can be invoked in the normal way: Monte Carlo Integration ======================= -This example shows how to use Numba to approximate the value of a definite integral by rapidly generating +This example shows how to use Numba to approximate the value of a definite integral by rapidly generating random numbers on the GPU. 
A detailed description of the mathematical mechanics of Monte Carlo integration -is out of the scope of the example, but it can briefly be described as an averaging process where the area +is out of the scope of the example, but it can briefly be described as an averaging process where the area under the curve is approximated by taking the average of many rectangles formed by its function values. -In addition, this example shows how to perform reductions in numba using the +In addition, this example shows how to perform reductions in numba using the :func:`cuda.reduce() ` API. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py diff --git a/docs/source/user/external-memory.rst b/docs/source/user/external-memory.rst index 28a8f59f0..a13071f80 100644 --- a/docs/source/user/external-memory.rst +++ b/docs/source/user/external-memory.rst @@ -52,7 +52,7 @@ sections, using the :func:`~numba.cuda.defer_cleanup` context manager. When an EMM Plugin is in use, the deallocation strategy is implemented by the EMM, and Numba's internal deallocation mechanism is not used. The EMM Plugin could implement: - + - A similar strategy to the Numba deallocation behaviour, or - Something more appropriate to the plugin - for example, deallocated memory might immediately be returned to a memory pool. 
diff --git a/docs/source/user/intrinsics.rst b/docs/source/user/intrinsics.rst index 521c1d918..4dc342b89 100644 --- a/docs/source/user/intrinsics.rst +++ b/docs/source/user/intrinsics.rst @@ -54,5 +54,3 @@ Multiple dimension arrays are supported by using a tuple of ints for the index:: result = np.zeros((3, 3, 3), dtype=np.float64) max_example_3d[(2, 2, 2), (5, 5, 5)](result, arr) print(result[0, 1, 2], '==', np.max(arr)) - - diff --git a/docs/source/user/laplace_final.svg b/docs/source/user/laplace_final.svg index 4f3b197fb..1f88cc93e 100644 --- a/docs/source/user/laplace_final.svg +++ b/docs/source/user/laplace_final.svg @@ -21,19 +21,19 @@ - - @@ -41,8 +41,8 @@ z - @@ -53,25 +53,25 @@ L 0 3.5 - @@ -89,28 +89,28 @@ z - @@ -130,23 +130,23 @@ z - @@ -166,34 +166,34 @@ z - @@ -213,43 +213,43 @@ z - @@ -269,18 +269,18 @@ z - @@ -295,130 +295,130 @@ z - - - - - - @@ -437,8 +437,8 @@ z - @@ -490,36 +490,36 @@ L -3.5 0 - @@ -552,29 +552,29 @@ z - @@ -587,168 +587,168 @@ z - - - - - - - @@ -767,125 +767,125 @@ z - - @@ -1894,23 +1894,23 @@ z - - - - @@ -1918,17 +1918,17 @@ L 684.288 51.3216 - diff --git a/docs/source/user/laplace_initial.svg b/docs/source/user/laplace_initial.svg index dbede3687..204626f84 100644 --- a/docs/source/user/laplace_initial.svg +++ b/docs/source/user/laplace_initial.svg @@ -21,19 +21,19 @@ - - @@ -41,8 +41,8 @@ z - @@ -53,25 +53,25 @@ L 0 3.5 - @@ -89,28 +89,28 @@ z - @@ -130,23 +130,23 @@ z - @@ -166,34 +166,34 @@ z - @@ -213,43 +213,43 @@ z - @@ -269,18 +269,18 @@ z - @@ -295,130 +295,130 @@ z - - - - - - @@ -437,8 +437,8 @@ z - @@ -537,168 +537,168 @@ L -3.5 0 - - - - - - - @@ -717,24 +717,24 @@ z - - @@ -1743,73 +1743,73 @@ z - - - - - - - diff --git a/docs/source/user/memory.rst b/docs/source/user/memory.rst index 116531e83..c876a8bf6 100644 --- a/docs/source/user/memory.rst +++ b/docs/source/user/memory.rst @@ -126,10 +126,10 @@ traditional dynamic memory management. device function). 
*shape* is either an integer or a tuple of integers representing the array's dimensions and must be a simple constant expression. A "simple constant expression" includes, but is not limited to: - + #. A literal (e.g. ``10``) #. A local variable whose right-hand side is a literal or a simple constant - expression (e.g. ``shape``, where ``shape`` is defined earlier in the function + expression (e.g. ``shape``, where ``shape`` is defined earlier in the function as ``shape = 10``) #. A global variable that is defined in the jitted function's globals by the time of compilation (e.g. ``shape``, where ``shape`` is defined using any expression @@ -259,14 +259,14 @@ unlike traditional dynamic memory management. Allocate a local array of the given *shape* and *type* on the device. *shape* is either an integer or a tuple of integers representing the array's - dimensions and must be a simple constant expression. A "simple constant expression" + dimensions and must be a simple constant expression. A "simple constant expression" includes, but is not limited to: #. A literal (e.g. ``10``) #. A local variable whose right-hand side is a literal or a simple constant expression (e.g. ``shape``, where ``shape`` is defined earlier in the function as ``shape = 10``) - #. A global variable that is defined in the jitted function's globals by the time + #. A global variable that is defined in the jitted function's globals by the time of compilation (e.g. ``shape``, where ``shape`` is defined using any expression at global scope). diff --git a/docs/source/user/simulator.rst b/docs/source/user/simulator.rst index 099ffc347..b10a0e874 100644 --- a/docs/source/user/simulator.rst +++ b/docs/source/user/simulator.rst @@ -11,7 +11,7 @@ be used to debug CUDA Python code, either by adding print statements to your code, or by using the debugger to step through the execution of an individual thread. 
-The simulator deliberately allows running non-CUDA code like starting a debugger +The simulator deliberately allows running non-CUDA code like starting a debugger and printing arbitrary expressions for debugging purposes. Therefore, it is best to start from code that compiles for the CUDA target, and then move over to the simulator to investigate issues. @@ -24,7 +24,7 @@ Using the simulator =================== The simulator is enabled by setting the environment variable -:envvar:`NUMBA_ENABLE_CUDASIM` to 1 prior to importing Numba. CUDA Python code +:envvar:`NUMBA_ENABLE_CUDASIM` to 1 prior to importing Numba. CUDA Python code may then be executed as normal. The easiest way to use the debugger inside a kernel is to only stop a single thread, otherwise the interaction with the debugger is difficult to handle. For example, the kernel below will stop in @@ -93,8 +93,8 @@ Some limitations of the simulator include: structured array access by attribute that works with the hardware target may fail in the simulator - see :ref:`structured-array-access`. * Operations directly against device arrays are only partially supported, that - is, testing equality, less than, greater than, and basic mathematical - operations are supported, but many other operations, such as the in-place + is, testing equality, less than, greater than, and basic mathematical + operations are supported, but many other operations, such as the in-place operators and bit operators are not. * The :func:`ffs() ` function only works correctly for values that can be represented using 32-bit integers. diff --git a/docs/source/user/ufunc.rst b/docs/source/user/ufunc.rst index 06f85e7ca..6beb5baab 100644 --- a/docs/source/user/ufunc.rst +++ b/docs/source/user/ufunc.rst @@ -64,7 +64,7 @@ the CUDA ufunc functionality. 
This may be accomplished as follows:: from numba import guvectorize - @guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], + @guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], '(m,n),(n,p)->(m,p)', target='cuda') def matmulcore(A, B, C): ... diff --git a/numba_cuda/_version.py b/numba_cuda/_version.py index 1cd1c11d6..01fe47f9f 100644 --- a/numba_cuda/_version.py +++ b/numba_cuda/_version.py @@ -15,5 +15,8 @@ import importlib.resources __version__ = ( - importlib.resources.files("numba_cuda").joinpath("VERSION").read_text().strip() + importlib.resources.files("numba_cuda") + .joinpath("VERSION") + .read_text() + .strip() ) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 01d468155..639d4d469 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -7,8 +7,12 @@ from .device_init import * from .device_init import _auto_device -from numba.cuda.compiler import (compile, compile_for_current_device, - compile_ptx, compile_ptx_for_current_device) +from numba.cuda.compiler import ( + compile, + compile_for_current_device, + compile_ptx, + compile_ptx_for_current_device, +) # This is the out-of-tree NVIDIA-maintained target. This is reported in Numba # sysinfo (`numba -s`): diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 5dfe7c434..9a2300a35 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -2,7 +2,6 @@ API that are reported to numba.cuda """ - import contextlib import os @@ -28,35 +27,37 @@ def from_cuda_array_interface(desc, owner=None, sync=True): If ``sync`` is ``True``, then the imported stream (if present) will be synchronized. 
""" - version = desc.get('version') + version = desc.get("version") # Mask introduced in version 1 if 1 <= version: - mask = desc.get('mask') + mask = desc.get("mask") # Would ideally be better to detect if the mask is all valid if mask is not None: - raise NotImplementedError('Masked arrays are not supported') + raise NotImplementedError("Masked arrays are not supported") - shape = desc['shape'] - strides = desc.get('strides') - dtype = np.dtype(desc['typestr']) + shape = desc["shape"] + strides = desc.get("strides") + dtype = np.dtype(desc["typestr"]) shape, strides, dtype = prepare_shape_strides_dtype( - shape, strides, dtype, order='C') + shape, strides, dtype, order="C" + ) size = driver.memory_size_from_info(shape, strides, dtype.itemsize) - devptr = driver.get_devptr_for_active_ctx(desc['data'][0]) + devptr = driver.get_devptr_for_active_ctx(desc["data"][0]) data = driver.MemoryPointer( - current_context(), devptr, size=size, owner=owner) - stream_ptr = desc.get('stream', None) + current_context(), devptr, size=size, owner=owner + ) + stream_ptr = desc.get("stream", None) if stream_ptr is not None: stream = external_stream(stream_ptr) if sync and config.CUDA_ARRAY_INTERFACE_SYNC: stream.synchronize() else: - stream = 0 # No "Numba default stream", not the CUDA default stream - da = devicearray.DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=data, - stream=stream) + stream = 0 # No "Numba default stream", not the CUDA default stream + da = devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, gpu_data=data, stream=stream + ) return da @@ -73,8 +74,9 @@ def as_cuda_array(obj, sync=True): if not is_cuda_array(obj): raise TypeError("*obj* doesn't implement the cuda array interface.") else: - return from_cuda_array_interface(obj.__cuda_array_interface__, - owner=obj, sync=sync) + return from_cuda_array_interface( + obj.__cuda_array_interface__, owner=obj, sync=sync + ) def is_cuda_array(obj): @@ -82,7 +84,7 @@ def 
is_cuda_array(obj): Does not verify the validity of the interface. """ - return hasattr(obj, '__cuda_array_interface__') + return hasattr(obj, "__cuda_array_interface__") def is_float16_supported(): @@ -125,8 +127,9 @@ def to_device(obj, stream=0, copy=True, to=None): hary = d_ary.copy_to_host(stream=stream) """ if to is None: - to, new = devicearray.auto_device(obj, stream=stream, copy=copy, - user_explicit=True) + to, new = devicearray.auto_device( + obj, stream=stream, copy=copy, user_explicit=True + ) return to if copy: to.copy_to_device(obj, stream=stream) @@ -134,20 +137,28 @@ def to_device(obj, stream=0, copy=True, to=None): @require_context -def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0): +def device_array(shape, dtype=np.float64, strides=None, order="C", stream=0): """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0) Allocate an empty device ndarray. Similar to :meth:`numpy.empty`. """ - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) - return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype, - stream=stream) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, stream=stream + ) @require_context -def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, - attach_global=True): +def managed_array( + shape, + dtype=np.float64, + strides=None, + order="C", + stream=0, + attach_global=True, +): """managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, attach_global=True) @@ -163,37 +174,48 @@ def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, *host*, and memory is only accessible by devices with Compute Capability 6.0 and later. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) - buffer = current_context().memallocmanaged(bytesize, - attach_global=attach_global) - npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + buffer = current_context().memallocmanaged( + bytesize, attach_global=attach_global + ) + npary = np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray) managedview.device_setup(buffer, stream=stream) return managedview @require_context -def pinned_array(shape, dtype=np.float64, strides=None, order='C'): +def pinned_array(shape, dtype=np.float64, strides=None, order="C"): """pinned_array(shape, dtype=np.float64, strides=None, order='C') Allocate an :class:`ndarray ` with a buffer that is pinned (pagelocked). Similar to :func:`np.empty() `. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) - bytesize = driver.memory_size_from_info(shape, strides, - dtype.itemsize) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) + bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) buffer = current_context().memhostalloc(bytesize) - return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + return np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) @require_context -def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, - portable=False, wc=False): +def mapped_array( + shape, + dtype=np.float64, + strides=None, + order="C", + stream=0, + portable=False, + wc=False, +): """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, portable=False, wc=False) @@ -206,12 +228,14 @@ def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, to write by the host and to read by the device, but slower to write by the host and slower to write by the device. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) buffer = current_context().memhostalloc(bytesize, mapped=True) - npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + npary = np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray) mappedview.device_setup(buffer, stream=stream) return mappedview @@ -243,8 +267,9 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0): driver_handle.reserved[:] = handle # use *IpcHandle* to open the IPC memory ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset) - yield ipchandle.open_array(current_context(), shape=shape, - strides=strides, dtype=dtype) + yield ipchandle.open_array( + current_context(), shape=shape, strides=strides, dtype=dtype + ) ipchandle.close() @@ -260,7 +285,7 @@ def _contiguous_strides_like_array(ary): """ # Don't recompute strides if the default strides will be sufficient to # create a contiguous array. - if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1: + if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1: return None # Otherwise, we need to compute new strides using an algorithm adapted from @@ -270,7 +295,7 @@ def _contiguous_strides_like_array(ary): # Stride permutation. E.g. 
a stride array (4, -2, 12) becomes # [(1, -2), (0, 4), (2, 12)] - strideperm = [ x for x in enumerate(ary.strides) ] + strideperm = [x for x in enumerate(ary.strides)] strideperm.sort(key=lambda x: x[1]) # Compute new strides using permutation @@ -283,10 +308,10 @@ def _contiguous_strides_like_array(ary): def _order_like_array(ary): - if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']: - return 'F' + if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]: + return "F" else: - return 'C' + return "C" def device_array_like(ary, stream=0): @@ -296,8 +321,13 @@ def device_array_like(ary, stream=0): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order, stream=stream) + return device_array( + shape=ary.shape, + dtype=ary.dtype, + strides=strides, + order=order, + stream=stream, + ) def mapped_array_like(ary, stream=0, portable=False, wc=False): @@ -307,8 +337,15 @@ def mapped_array_like(ary, stream=0, portable=False, wc=False): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order, stream=stream, portable=portable, wc=wc) + return mapped_array( + shape=ary.shape, + dtype=ary.dtype, + strides=strides, + order=order, + stream=stream, + portable=portable, + wc=wc, + ) def pinned_array_like(ary): @@ -318,8 +355,9 @@ def pinned_array_like(ary): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return pinned_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) # Stream helper @@ -373,13 +411,15 @@ def external_stream(ptr): @require_context @contextlib.contextmanager def pinned(*arylist): - """A context manager for temporary pinning a sequence of host ndarrays. 
- """ + """A context manager for temporary pinning a sequence of host ndarrays.""" pmlist = [] for ary in arylist: - pm = current_context().mempin(ary, driver.host_pointer(ary), - driver.host_memory_size(ary), - mapped=False) + pm = current_context().mempin( + ary, + driver.host_pointer(ary), + driver.host_memory_size(ary), + mapped=False, + ) pmlist.append(pm) yield @@ -387,16 +427,18 @@ def pinned(*arylist): @require_context @contextlib.contextmanager def mapped(*arylist, **kws): - """A context manager for temporarily mapping a sequence of host ndarrays. - """ - assert not kws or 'stream' in kws, "Only accept 'stream' as keyword." - stream = kws.get('stream', 0) + """A context manager for temporarily mapping a sequence of host ndarrays.""" + assert not kws or "stream" in kws, "Only accept 'stream' as keyword." + stream = kws.get("stream", 0) pmlist = [] devarylist = [] for ary in arylist: - pm = current_context().mempin(ary, driver.host_pointer(ary), - driver.host_memory_size(ary), - mapped=True) + pm = current_context().mempin( + ary, + driver.host_pointer(ary), + driver.host_memory_size(ary), + mapped=True, + ) pmlist.append(pm) devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream) devarylist.append(devary) @@ -427,6 +469,7 @@ def event(timing=True): # Device selection + def select_device(device_id): """ Make the context associated with device *device_id* the current context. @@ -468,7 +511,7 @@ def detect(): Returns a boolean indicating whether any supported devices were detected. 
""" devlist = list_devices() - print('Found %d CUDA devices' % len(devlist)) + print("Found %d CUDA devices" % len(devlist)) supported_count = 0 for dev in devlist: attrs = [] @@ -476,29 +519,29 @@ def detect(): kernel_timeout = dev.KERNEL_EXEC_TIMEOUT tcc = dev.TCC_DRIVER fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO - attrs += [('Compute Capability', '%d.%d' % cc)] - attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)] - attrs += [('PCI Bus ID', dev.PCI_BUS_ID)] - attrs += [('UUID', dev.uuid)] - attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')] + attrs += [("Compute Capability", "%d.%d" % cc)] + attrs += [("PCI Device ID", dev.PCI_DEVICE_ID)] + attrs += [("PCI Bus ID", dev.PCI_BUS_ID)] + attrs += [("UUID", dev.uuid)] + attrs += [("Watchdog", "Enabled" if kernel_timeout else "Disabled")] if os.name == "nt": - attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')] - attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)] + attrs += [("Compute Mode", "TCC" if tcc else "WDDM")] + attrs += [("FP32/FP64 Performance Ratio", fp32_to_fp64_ratio)] if cc < (3, 5): - support = '[NOT SUPPORTED: CC < 3.5]' + support = "[NOT SUPPORTED: CC < 3.5]" elif cc < (5, 0): - support = '[SUPPORTED (DEPRECATED)]' + support = "[SUPPORTED (DEPRECATED)]" supported_count += 1 else: - support = '[SUPPORTED]' + support = "[SUPPORTED]" supported_count += 1 - print('id %d %20s %40s' % (dev.id, dev.name, support)) + print("id %d %20s %40s" % (dev.id, dev.name, support)) for key, val in attrs: - print('%40s: %s' % (key, val)) + print("%40s: %s" % (key, val)) - print('Summary:') - print('\t%d/%d devices are supported' % (supported_count, len(devlist))) + print("Summary:") + print("\t%d/%d devices are supported" % (supported_count, len(devlist))) return supported_count > 0 diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index b8bffb7c1..1b2694af7 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ 
b/numba_cuda/numba/cuda/api_util.py @@ -17,14 +17,14 @@ def _fill_stride_by_order(shape, dtype, order): if nd == 0: return () strides = [0] * nd - if order == 'C': + if order == "C": strides[-1] = dtype.itemsize for d in reversed(range(nd - 1)): strides[d] = strides[d + 1] * shape[d + 1] - elif order == 'F': + elif order == "F": strides[0] = dtype.itemsize for d in range(1, nd): strides[d] = strides[d - 1] * shape[d - 1] else: - raise ValueError('must be either C/F order') + raise ValueError("must be either C/F order") return tuple(strides) diff --git a/numba_cuda/numba/cuda/args.py b/numba_cuda/numba/cuda/args.py index 472bd0b87..ff204c619 100644 --- a/numba_cuda/numba/cuda/args.py +++ b/numba_cuda/numba/cuda/args.py @@ -2,6 +2,7 @@ Hints to wrap Kernel arguments to indicate how to manage host-device memory transfers before & after the kernel call. """ + import abc from numba.core.typing.typeof import typeof, Purpose @@ -31,9 +32,8 @@ def _numba_type_(self): class In(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, _ = auto_device( - self.value, - stream=stream) + + devary, _ = auto_device(self.value, stream=stream) # A dummy writeback functor to keep devary alive until the kernel # is called. 
retr.append(lambda: devary) @@ -43,10 +43,8 @@ def to_device(self, retr, stream=0): class Out(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, conv = auto_device( - self.value, - copy=False, - stream=stream) + + devary, conv = auto_device(self.value, copy=False, stream=stream) if conv: retr.append(lambda: devary.copy_to_host(self.value, stream=stream)) return devary @@ -55,9 +53,8 @@ def to_device(self, retr, stream=0): class InOut(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, conv = auto_device( - self.value, - stream=stream) + + devary, conv = auto_device(self.value, stream=stream) if conv: retr.append(lambda: devary.copy_to_host(self.value, stream=stream)) return devary @@ -68,10 +65,9 @@ def wrap_arg(value, default=InOut): __all__ = [ - 'In', - 'Out', - 'InOut', - - 'ArgHint', - 'wrap_arg', + "In", + "Out", + "InOut", + "ArgHint", + "wrap_arg", ] diff --git a/numba_cuda/numba/cuda/cg.py b/numba_cuda/numba/cuda/cg.py index 00d55704b..c3dc4add6 100644 --- a/numba_cuda/numba/cuda/cg.py +++ b/numba_cuda/numba/cuda/cg.py @@ -26,13 +26,13 @@ def codegen(context, builder, sig, args): one = context.get_constant(types.int32, 1) mod = builder.module return builder.call( - nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), - (one,)) + nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), (one,) + ) return sig, codegen -@overload(this_grid, target='cuda') +@overload(this_grid, target="cuda") def _ol_this_grid(): def impl(): return _this_grid() @@ -48,13 +48,13 @@ def codegen(context, builder, sig, args): flags = context.get_constant(types.int32, 0) mod = builder.module return builder.call( - nvvmutils.declare_cudaCGSynchronize(mod), - (*args, flags)) + nvvmutils.declare_cudaCGSynchronize(mod), (*args, flags) + ) return sig, codegen -@overload_method(GridGroupClass, 'sync', target='cuda') +@overload_method(GridGroupClass, "sync", target="cuda") def 
_ol_grid_group_sync(group): def impl(group): return _grid_group_sync(group) diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 426eb82b3..2660f574a 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -9,7 +9,7 @@ import subprocess import tempfile -CUDA_TRIPLE = 'nvptx64-nvidia-cuda' +CUDA_TRIPLE = "nvptx64-nvidia-cuda" def run_nvdisasm(cubin, flags): @@ -19,19 +19,24 @@ def run_nvdisasm(cubin, flags): fname = None try: fd, fname = tempfile.mkstemp() - with open(fname, 'wb') as f: + with open(fname, "wb") as f: f.write(cubin) try: - cp = subprocess.run(['nvdisasm', *flags, fname], check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + cp = subprocess.run( + ["nvdisasm", *flags, fname], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) except FileNotFoundError as e: - msg = ("nvdisasm has not been found. You may need " - "to install the CUDA toolkit and ensure that " - "it is available on your PATH.\n") + msg = ( + "nvdisasm has not been found. 
You may need " + "to install the CUDA toolkit and ensure that " + "it is available on your PATH.\n" + ) raise RuntimeError(msg) from e - return cp.stdout.decode('utf-8') + return cp.stdout.decode("utf-8") finally: if fd is not None: os.close(fd) @@ -41,13 +46,13 @@ def run_nvdisasm(cubin, flags): def disassemble_cubin(cubin): # Request lineinfo in disassembly - flags = ['-gi'] + flags = ["-gi"] return run_nvdisasm(cubin, flags) def disassemble_cubin_for_cfg(cubin): # Request control flow graph in disassembly - flags = ['-cfg'] + flags = ["-cfg"] return run_nvdisasm(cubin, flags) @@ -65,7 +70,7 @@ def __init__( entry_name=None, max_registers=None, lto=False, - nvvm_options=None + nvvm_options=None, ): """ codegen: @@ -142,7 +147,7 @@ def get_asm_str(self, cc=None): arch = nvvm.get_arch_option(*cc) options = self._nvvm_options.copy() - options['arch'] = arch + options["arch"] = arch irs = self.llvm_strs @@ -151,12 +156,12 @@ def get_asm_str(self, cc=None): # Sometimes the result from NVVM contains trailing whitespace and # nulls, which we strip so that the assembly dump looks a little # tidier. 
- ptx = ptx.decode().strip('\x00').strip() + ptx = ptx.decode().strip("\x00").strip() if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % self._name).center(80, '-')) + print(("ASSEMBLY %s" % self._name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) self._ptx_cache[cc] = ptx @@ -171,8 +176,8 @@ def get_ltoir(self, cc=None): arch = nvvm.get_arch_option(*cc) options = self._nvvm_options.copy() - options['arch'] = arch - options['gen-lto'] = None + options["arch"] = arch + options["gen-lto"] = None irs = self.llvm_strs ltoir = nvvm.compile_ir(irs, **options) @@ -192,7 +197,7 @@ def _link_all(self, linker, cc, ignore_nonlto=False): linker.add_file_guess_ext(path, ignore_nonlto) if self.needs_cudadevrt: linker.add_file_guess_ext( - get_cudalib('cudadevrt', static=True), ignore_nonlto + get_cudalib("cudadevrt", static=True), ignore_nonlto ) def get_cubin(self, cc=None): @@ -207,22 +212,20 @@ def get_cubin(self, cc=None): max_registers=self._max_registers, cc=cc, additional_flags=["-ptx"], - lto=self._lto + lto=self._lto, ) # `-ptx` flag is meant to view the optimized PTX for LTO objects. # Non-LTO objects are not passed to linker. self._link_all(linker, cc, ignore_nonlto=True) - ptx = linker.get_linked_ptx().decode('utf-8') + ptx = linker.get_linked_ptx().decode("utf-8") - print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-')) + print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) linker = driver.Linker.new( - max_registers=self._max_registers, - cc=cc, - lto=self._lto + max_registers=self._max_registers, cc=cc, lto=self._lto ) self._link_all(linker, cc, ignore_nonlto=False) cubin = linker.complete() @@ -234,8 +237,10 @@ def get_cubin(self, cc=None): def get_cufunc(self): if self._entry_name is None: - msg = "Missing entry_name - are you trying to get the cufunc " \ - "for a device function?" + msg = ( + "Missing entry_name - are you trying to get the cufunc " + "for a device function?" 
+ ) raise RuntimeError(msg) ctx = devices.get_context() @@ -260,7 +265,7 @@ def get_linkerinfo(self, cc): try: return self._linkerinfo_cache[cc] except KeyError: - raise KeyError(f'No linkerinfo for CC {cc}') + raise KeyError(f"No linkerinfo for CC {cc}") def get_sass(self, cc=None): return disassemble_cubin(self.get_cubin(cc=cc)) @@ -271,7 +276,7 @@ def get_sass_cfg(self, cc=None): def add_ir_module(self, mod): self._raise_if_finalized() if self._module is not None: - raise RuntimeError('CUDACodeLibrary only supports one module') + raise RuntimeError("CUDACodeLibrary only supports one module") self._module = mod def add_linking_library(self, library): @@ -291,12 +296,13 @@ def get_function(self, name): for fn in self._module.functions: if fn.name == name: return fn - raise KeyError(f'Function {name} not found') + raise KeyError(f"Function {name} not found") @property def modules(self): - return [self._module] + [mod for lib in self._linking_libraries - for mod in lib.modules] + return [self._module] + [ + mod for lib in self._linking_libraries for mod in lib.modules + ] @property def linking_libraries(self): @@ -331,7 +337,7 @@ def finalize(self): for mod in library.modules: for fn in mod.functions: if not fn.is_declaration: - fn.linkage = 'linkonce_odr' + fn.linkage = "linkonce_odr" self._finalized = True @@ -342,10 +348,10 @@ def _reduce_states(self): after deserialization. 
""" if self._linking_files: - msg = 'Cannot pickle CUDACodeLibrary with linking files' + msg = "Cannot pickle CUDACodeLibrary with linking files" raise RuntimeError(msg) if not self._finalized: - raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary') + raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary") return dict( codegen=None, name=self.name, @@ -356,13 +362,23 @@ def _reduce_states(self): linkerinfo_cache=self._linkerinfo_cache, max_registers=self._max_registers, nvvm_options=self._nvvm_options, - needs_cudadevrt=self.needs_cudadevrt + needs_cudadevrt=self.needs_cudadevrt, ) @classmethod - def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache, - cubin_cache, linkerinfo_cache, max_registers, nvvm_options, - needs_cudadevrt): + def _rebuild( + cls, + codegen, + name, + entry_name, + llvm_strs, + ptx_cache, + cubin_cache, + linkerinfo_cache, + max_registers, + nvvm_options, + needs_cudadevrt, + ): """ Rebuild an instance. """ diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py index 49968890e..2009e777f 100644 --- a/numba_cuda/numba/cuda/compiler.py +++ b/numba_cuda/numba/cuda/compiler.py @@ -1,19 +1,39 @@ from llvmlite import ir from numba.core.typing.templates import ConcreteTemplate from numba.core import ir as numba_ir -from numba.core import (cgutils, types, typing, funcdesc, config, compiler, - sigutils, utils) -from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, - DefaultPassBuilder, Flags, Option, - CompileResult) +from numba.core import ( + cgutils, + types, + typing, + funcdesc, + config, + compiler, + sigutils, + utils, +) +from numba.core.compiler import ( + sanitize_compile_result_entries, + CompilerBase, + DefaultPassBuilder, + Flags, + Option, + CompileResult, +) from numba.core.compiler_lock import global_compiler_lock -from numba.core.compiler_machinery import (FunctionPass, LoweringPass, - PassManager, register_pass) +from numba.core.compiler_machinery 
import ( + FunctionPass, + LoweringPass, + PassManager, + register_pass, +) from numba.core.interpreter import Interpreter from numba.core.errors import NumbaInvalidConfigWarning from numba.core.untyped_passes import TranslateByteCode -from numba.core.typed_passes import (IRLegalization, NativeLowering, - AnnotateTypes) +from numba.core.typed_passes import ( + IRLegalization, + NativeLowering, + AnnotateTypes, +) from warnings import warn from numba.cuda import nvvmutils from numba.cuda.api import get_current_device @@ -52,15 +72,9 @@ class CUDAFlags(Flags): doc="Compute Capability", ) max_registers = Option( - type=_optional_int_type, - default=None, - doc="Max registers" - ) - lto = Option( - type=bool, - default=False, - doc="Enable Link-time Optimization" + type=_optional_int_type, default=None, doc="Max registers" ) + lto = Option(type=bool, default=False, doc="Enable Link-time Optimization") # The CUDACompileResult (CCR) has a specially-defined entry point equal to its @@ -79,6 +93,7 @@ class CUDAFlags(Flags): # point will no longer need to be a synthetic value, but will instead be a # pointer to the compiled function as in the CPU target. 
+ class CUDACompileResult(CompileResult): @property def entry_point(self): @@ -92,7 +107,6 @@ def cuda_compile_result(**entries): @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): - _name = "cuda_backend" def __init__(self): @@ -102,7 +116,7 @@ def run_pass(self, state): """ Back-end: Packages lowering output in a compile result """ - lowered = state['cr'] + lowered = state["cr"] signature = typing.signature(state.return_type, *state.args) state.cr = cuda_compile_result( @@ -137,9 +151,12 @@ def run_pass(self, state): nvvm_options = state.flags.nvvm_options max_registers = state.flags.max_registers lto = state.flags.lto - state.library = codegen.create_library(name, nvvm_options=nvvm_options, - max_registers=max_registers, - lto=lto) + state.library = codegen.create_library( + name, + nvvm_options=nvvm_options, + max_registers=max_registers, + lto=lto, + ) # Enable object caching upfront so that the library can be serialized. state.library.enable_object_caching() @@ -165,13 +182,15 @@ def _op_JUMP_IF(self, inst, pred, iftrue): gv_fn = numba_ir.Global("bool", bool, loc=self.loc) self.store(value=gv_fn, name=name) - callres = numba_ir.Expr.call(self.get(name), (self.get(pred),), (), - loc=self.loc) + callres = numba_ir.Expr.call( + self.get(name), (self.get(pred),), (), loc=self.loc + ) pname = "$%spred" % (inst.offset) predicate = self.store(value=callres, name=pname) - bra = numba_ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr, - loc=self.loc) + bra = numba_ir.Branch( + cond=predicate, truebr=truebr, falsebr=falsebr, loc=self.loc + ) self.current_block.append(bra) @@ -183,18 +202,18 @@ def __init__(self): FunctionPass.__init__(self) def run_pass(self, state): - func_id = state['func_id'] - bc = state['bc'] + func_id = state["func_id"] + bc = state["bc"] interp = CUDABytecodeInterpreter(func_id) func_ir = interp.interpret(bc) - state['func_ir'] = func_ir + state["func_ir"] = func_ir return True class 
CUDACompiler(CompilerBase): def define_pipelines(self): dpb = DefaultPassBuilder - pm = PassManager('cuda') + pm = PassManager("cuda") untyped_passes = dpb.define_untyped_pipeline(self.state) @@ -225,10 +244,9 @@ def replace_translate_pass(implementation, description): return [pm] def define_cuda_lowering_pipeline(self, state): - pm = PassManager('cuda_lowering') + pm = PassManager("cuda_lowering") # legalise - pm.add_pass(IRLegalization, - "ensure IR is legal prior to lowering") + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") pm.add_pass(AnnotateTypes, "annotate types") # lower @@ -241,13 +259,24 @@ def define_cuda_lowering_pipeline(self, state): @global_compiler_lock -def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, - inline=False, fastmath=False, nvvm_options=None, - cc=None, max_registers=None, lto=False): +def compile_cuda( + pyfunc, + return_type, + args, + debug=False, + lineinfo=False, + inline=False, + fastmath=False, + nvvm_options=None, + cc=None, + max_registers=None, + lto=False, +): if cc is None: - raise ValueError('Compute Capability must be supplied') + raise ValueError("Compute Capability must be supplied") from .descriptor import cuda_target + typingctx = cuda_target.typing_context targetctx = cuda_target.target_context @@ -269,10 +298,10 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, flags.dbg_directives_only = True if debug: - flags.error_model = 'python' + flags.error_model = "python" flags.dbg_extend_lifetimes = True else: - flags.error_model = 'numpy' + flags.error_model = "numpy" if inline: flags.forceinline = True @@ -286,15 +315,18 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, # Run compilation pipeline from numba.core.target_extension import target_override - with target_override('cuda'): - cres = compiler.compile_extra(typingctx=typingctx, - targetctx=targetctx, - func=pyfunc, - args=args, - return_type=return_type, - flags=flags, - 
locals={}, - pipeline_class=CUDACompiler) + + with target_override("cuda"): + cres = compiler.compile_extra( + typingctx=typingctx, + targetctx=targetctx, + func=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=CUDACompiler, + ) library = cres.library library.finalize() @@ -302,8 +334,9 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, return cres -def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, - nvvm_options): +def cabi_wrap_function( + context, lib, fndesc, wrapper_function_name, nvvm_options +): """ Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level. @@ -311,9 +344,11 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, """ # The wrapper will be contained in a new library that links to the wrapped # function's library - library = lib.codegen.create_library(f'{lib.name}_function_', - entry_name=wrapper_function_name, - nvvm_options=nvvm_options) + library = lib.codegen.create_library( + f"{lib.name}_function_", + entry_name=wrapper_function_name, + nvvm_options=nvvm_options, + ) library.add_linking_library(lib) # Determine the caller (C ABI) and wrapper (Numba ABI) function types @@ -331,14 +366,15 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, # its return value wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name) - builder = ir.IRBuilder(wrapfn.append_basic_block('')) + builder = ir.IRBuilder(wrapfn.append_basic_block("")) arginfo = context.get_arg_packer(argtypes) callargs = arginfo.from_arguments(builder, wrapfn.args) # We get (status, return_value), but we ignore the status since we # can't propagate it through the C ABI anyway _, return_value = context.call_conv.call_function( - builder, func, restype, argtypes, callargs) + builder, func, restype, argtypes, callargs + ) builder.ret(return_value) if config.DUMP_LLVM: @@ -395,8 +431,10 @@ def kernel_fixup(kernel, debug): # Find all stores first 
for inst in block.instructions: - if (isinstance(inst, ir.StoreInstr) - and inst.operands[1] == return_value): + if ( + isinstance(inst, ir.StoreInstr) + and inst.operands[1] == return_value + ): remove_list.append(inst) # Remove all stores @@ -407,8 +445,9 @@ def kernel_fixup(kernel, debug): # value if isinstance(kernel.type, ir.PointerType): - new_type = ir.PointerType(ir.FunctionType(ir.VoidType(), - kernel.type.pointee.args[1:])) + new_type = ir.PointerType( + ir.FunctionType(ir.VoidType(), kernel.type.pointee.args[1:]) + ) else: new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:]) @@ -418,13 +457,13 @@ def kernel_fixup(kernel, debug): # If debug metadata is present, remove the return value from it - if kernel_metadata := getattr(kernel, 'metadata', None): - if dbg_metadata := kernel_metadata.get('dbg', None): + if kernel_metadata := getattr(kernel, "metadata", None): + if dbg_metadata := kernel_metadata.get("dbg", None): for name, value in dbg_metadata.operands: if name == "type": type_metadata = value for tm_name, tm_value in type_metadata.operands: - if tm_name == 'types': + if tm_name == "types": types = tm_value types.operands = types.operands[1:] if config.DUMP_LLVM: @@ -435,26 +474,24 @@ def kernel_fixup(kernel, debug): nvvm.set_cuda_kernel(kernel) if config.DUMP_LLVM: - print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-')) + print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, "-")) print(kernel.module) - print('=' * 80) + print("=" * 80) def add_exception_store_helper(kernel): - # Create global variables for exception state def define_error_gv(postfix): name = kernel.name + postfix - gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), - name) + gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), name) gv.initializer = ir.Constant(gv.type.pointee, None) return gv gv_exc = define_error_gv("__errcode__") gv_tid = [] gv_ctaid = [] - for i in 'xyz': + for i in "xyz": 
gv_tid.append(define_error_gv("__tid%s__" % i)) gv_ctaid.append(define_error_gv("__ctaid%s__" % i)) @@ -484,18 +521,25 @@ def define_error_gv(postfix): # Use atomic cmpxchg to prevent rewriting the error status # Only the first error is recorded - xchg = builder.cmpxchg(gv_exc, old, status.code, - 'monotonic', 'monotonic') + xchg = builder.cmpxchg( + gv_exc, old, status.code, "monotonic", "monotonic" + ) changed = builder.extract_value(xchg, 1) # If the xchange is successful, save the thread ID. sreg = nvvmutils.SRegBuilder(builder) with builder.if_then(changed): - for dim, ptr, in zip("xyz", gv_tid): + for ( + dim, + ptr, + ) in zip("xyz", gv_tid): val = sreg.tid(dim) builder.store(val, ptr) - for dim, ptr, in zip("xyz", gv_ctaid): + for ( + dim, + ptr, + ) in zip("xyz", gv_ctaid): val = sreg.ctaid(dim) builder.store(val, ptr) @@ -505,9 +549,19 @@ def define_error_gv(postfix): @global_compiler_lock -def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, - fastmath=False, cc=None, opt=None, abi="c", abi_info=None, - output='ptx'): +def compile( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + cc=None, + opt=None, + abi="c", + abi_info=None, + output="ptx", +): """Compile a Python function to PTX or LTO-IR for a given set of argument types. 
@@ -551,43 +605,49 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, :rtype: tuple """ if abi not in ("numba", "c"): - raise NotImplementedError(f'Unsupported ABI: {abi}') + raise NotImplementedError(f"Unsupported ABI: {abi}") - if abi == 'c' and not device: - raise NotImplementedError('The C ABI is not supported for kernels') + if abi == "c" and not device: + raise NotImplementedError("The C ABI is not supported for kernels") if output not in ("ptx", "ltoir"): - raise NotImplementedError(f'Unsupported output type: {output}') + raise NotImplementedError(f"Unsupported output type: {output}") debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug opt = (config.OPT != 0) if opt is None else opt if debug and opt: - msg = ("debug=True with opt=True " - "is not supported by CUDA. This may result in a crash" - " - set debug=False or opt=False.") + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." 
+ ) warn(NumbaInvalidConfigWarning(msg)) - lto = (output == 'ltoir') + lto = output == "ltoir" abi_info = abi_info or dict() - nvvm_options = { - 'fastmath': fastmath, - 'opt': 3 if opt else 0 - } + nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0} if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None if lto: - nvvm_options['gen-lto'] = None + nvvm_options["gen-lto"] = None args, return_type = sigutils.normalize_signature(sig) cc = cc or config.CUDA_DEFAULT_PTX_CC - cres = compile_cuda(pyfunc, return_type, args, debug=debug, - lineinfo=lineinfo, fastmath=fastmath, - nvvm_options=nvvm_options, cc=cc) + cres = compile_cuda( + pyfunc, + return_type, + args, + debug=debug, + lineinfo=lineinfo, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + ) resty = cres.signature.return_type if resty and not device and resty != types.void: @@ -598,9 +658,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, if device: lib = cres.library if abi == "c": - wrapper_name = abi_info.get('abi_name', pyfunc.__name__) - lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name, - nvvm_options) + wrapper_name = abi_info.get("abi_name", pyfunc.__name__) + lib = cabi_wrap_function( + tgt, lib, cres.fndesc, wrapper_name, nvvm_options + ) else: lib = cres.library kernel = lib.get_function(cres.fndesc.llvm_func_name) @@ -614,38 +675,94 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, return code, resty -def compile_for_current_device(pyfunc, sig, debug=None, lineinfo=False, - device=True, fastmath=False, opt=None, - abi="c", abi_info=None, output='ptx'): +def compile_for_current_device( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + opt=None, + abi="c", + abi_info=None, + output="ptx", +): """Compile a Python function to PTX or LTO-IR for a given signature for the current device's compute capabilility. 
This calls :func:`compile` with an appropriate ``cc`` value for the current device.""" cc = get_current_device().compute_capability - return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device, - fastmath=fastmath, cc=cc, opt=opt, abi=abi, - abi_info=abi_info, output=output) + return compile( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + output=output, + ) -def compile_ptx(pyfunc, sig, debug=None, lineinfo=False, device=False, - fastmath=False, cc=None, opt=None, abi="numba", abi_info=None): +def compile_ptx( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=False, + fastmath=False, + cc=None, + opt=None, + abi="numba", + abi_info=None, +): """Compile a Python function to PTX for a given signature. See :func:`compile`. The defaults for this function are to compile a kernel with the Numba ABI, rather than :func:`compile`'s default of compiling a device function with the C ABI.""" - return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device, - fastmath=fastmath, cc=cc, opt=opt, abi=abi, - abi_info=abi_info, output='ptx') + return compile( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + output="ptx", + ) -def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False, - device=False, fastmath=False, opt=None, - abi="numba", abi_info=None): +def compile_ptx_for_current_device( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=False, + fastmath=False, + opt=None, + abi="numba", + abi_info=None, +): """Compile a Python function to PTX for a given signature for the current device's compute capabilility. 
See :func:`compile_ptx`.""" cc = get_current_device().compute_capability - return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo, - device=device, fastmath=fastmath, cc=cc, opt=opt, - abi=abi, abi_info=abi_info) + return compile_ptx( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + ) def declare_device_function(name, restype, argtypes, link): @@ -654,6 +771,7 @@ def declare_device_function(name, restype, argtypes, link): def declare_device_function_template(name, restype, argtypes, link): from .descriptor import cuda_target + typingctx = cuda_target.typing_context targetctx = cuda_target.target_context sig = typing.signature(restype, *argtypes) @@ -664,7 +782,8 @@ class device_function_template(ConcreteTemplate): cases = [sig] fndesc = funcdesc.ExternalFunctionDescriptor( - name=name, restype=restype, argtypes=argtypes) + name=name, restype=restype, argtypes=argtypes + ) typingctx.insert_user_function(extfn, device_function_template) targetctx.insert_user_function(extfn, fndesc) diff --git a/numba_cuda/numba/cuda/cpp_function_wrappers.cu b/numba_cuda/numba/cuda/cpp_function_wrappers.cu index a2cd1e054..105152805 100644 --- a/numba_cuda/numba/cuda/cpp_function_wrappers.cu +++ b/numba_cuda/numba/cuda/cpp_function_wrappers.cu @@ -23,7 +23,7 @@ FNDEF(hdiv)( ) { __half retval = __hdiv(__short_as_half (x), __short_as_half (y)); - + *return_value = __half_as_short (retval); // Signal that no Python exception occurred return 0; @@ -44,4 +44,3 @@ UNARY_FUNCTION(hceil) UNARY_FUNCTION(hrcp) UNARY_FUNCTION(hrint) UNARY_FUNCTION(htrunc) - diff --git a/numba_cuda/numba/cuda/cuda_fp16.h b/numba_cuda/numba/cuda/cuda_fp16.h index 3001595e9..9780be106 100644 --- a/numba_cuda/numba/cuda/cuda_fp16.h +++ b/numba_cuda/numba/cuda/cuda_fp16.h @@ -112,33 +112,33 @@ /* Forward-declaration of structures defined in "cuda_fp16.hpp" */ /** - * \brief half datatype - * - * \details This 
structure implements the datatype for storing - * half-precision floating-point numbers. The structure implements - * assignment operators and type conversions. - * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, - * and the significand is being stored in 10 bits. - * The total precision is 11 bits. There are 15361 representable - * numbers within the interval [0.0, 1.0], endpoints included. - * On average we have log10(2**11) ~ 3.311 decimal digits. - * + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * * \internal - * \req IEEE 754-2008 compliant implementation of half-precision - * floating-point numbers. + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. * \endinternal */ struct __half; /** * \brief half2 datatype - * - * \details This structure implements the datatype for storing two - * half-precision floating-point numbers. - * The structure implements assignment operators and type conversions. - * + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * * \internal - * \req Vectorified version of half. + * \req Vectorified version of half. 
* \endinternal */ struct __half2; @@ -161,12 +161,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-to-nearest-even mode -* and returns \p half with converted value. -* -* \details Converts float number \p a to half precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -179,9 +179,9 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); * and returns \p half with converted value. * * \details Converts float number \p a to half precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -192,11 +192,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-towards-zero mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-towards-zero mode. -* \param[in] a - float. Is only being read. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -207,12 +207,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-down mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-down mode. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -223,12 +223,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-up mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-up mode. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -238,12 +238,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts \p half number to float. -* +* * \details Converts half number \p a to float. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns float -* \retval a converted to float. +* \retval a converted to float. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -257,7 +257,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); * * \details Converts input \p a to half precision in round-to-nearest-even mode and * populates both halves of \p half2 with converted value. -* \param[in] a - float. 
Is only being read. +* \param[in] a - float. Is only being read. * * \returns half2 * \retval The \p half2 value with both halves equal to the converted half @@ -277,9 +277,9 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); * and combines the results into one \p half2 number. Low 16 bits of the return * value correspond to the input \p a, high 16 bits correspond to the input \p * b. -* \param[in] a - float. Is only being read. -* \param[in] b - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* * \returns half2 * \retval The \p half2 value with corresponding halves equal to the * converted input floats. @@ -292,11 +292,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const flo /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts low 16 bits of \p half2 to float and returns the result -* +* * \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number * and returns the result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float * \retval The low 16 bits of \p a converted to float. * \internal @@ -308,11 +308,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts high 16 bits of \p half2 to float and returns the result -* +* * \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number * and returns the result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float * \retval The high 16 bits of \p a converted to float. * \internal @@ -327,13 +327,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts both components of float2 number to half precision in * round-to-nearest-even mode and returns \p half2 with converted values. 
-* +* * \details Converts both components of float2 to half precision in round-to-nearest * mode and combines the results into one \p half2 number. Low 16 bits of the * return value correspond to \p a.x and high 16 bits of the return value * correspond to \p a.y. -* \param[in] a - float2. Is only being read. -* +* \param[in] a - float2. Is only being read. +* * \returns half2 * \retval The \p half2 which has corresponding halves equal to the * converted float2 components. @@ -346,11 +346,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts both halves of \p half2 to float2 and returns the result. -* +* * \details Converts both halves of \p half2 input \p a to float2 and returns the * result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float2 * \retval a converted to float2. * \internal @@ -362,13 +362,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-to-nearest-even mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -378,13 +378,13 @@ __CUDA_FP16_DECL__ int __half2int_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. 
+* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -394,13 +394,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -410,13 +410,13 @@ __CUDA_FP16_DECL__ int __half2int_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -427,13 +427,13 @@ __CUDA_FP16_DECL__ int __half2int_ru(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-to-nearest-even mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -443,13 +443,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-towards-zero mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -459,13 +459,13 @@ __CUDA_FP16_DECL__ __half __int2half_rz(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-down mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -475,13 +475,13 @@ __CUDA_FP16_DECL__ __half __int2half_rd(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-up mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -493,13 +493,13 @@ __CUDA_FP16_DECL__ __half __int2half_ru(const int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-to-nearest-even * mode. 
-* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -509,13 +509,13 @@ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -525,13 +525,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -541,13 +541,13 @@ __CUDA_FP16_DECL__ short int __half2short_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-up mode. 
-* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -559,13 +559,13 @@ __CUDA_FP16_DECL__ short int __half2short_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -575,13 +575,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-towards-zero mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -591,13 +591,13 @@ __CUDA_FP16_DECL__ __half __short2half_rz(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-down mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. 
+* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -607,13 +607,13 @@ __CUDA_FP16_DECL__ __half __short2half_rd(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-up mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -624,13 +624,13 @@ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned integer in round-to-nearest-even mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -640,13 +640,13 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -659,10 +659,10 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); * * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-down mode. -* \param[in] h - half. Is only being read. +* \param[in] h - half. Is only being read. * * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -675,10 +675,10 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); * * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-up mode. -* \param[in] h - half. Is only being read. +* \param[in] h - half. Is only being read. * * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -689,13 +689,13 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-to-nearest-even mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -705,13 +705,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-towards-zero mode. 
-* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -721,13 +721,13 @@ __CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-down mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -737,13 +737,13 @@ __CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-up mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -755,13 +755,13 @@ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. 
Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -772,13 +772,13 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-towards-zero * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -788,25 +788,25 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. */ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. 
*/ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); @@ -814,13 +814,13 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -831,13 +831,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-towards-zero * mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -847,13 +847,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-down mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -863,13 +863,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-up mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -881,13 +881,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -898,13 +898,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-towards-zero * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -914,13 +914,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -930,13 +930,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -948,13 +948,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -965,13 +965,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-towards-zero * mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -981,13 +981,13 @@ __CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-down mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -997,13 +997,13 @@ __CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-up mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1015,13 +1015,13 @@ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1031,13 +1031,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1047,13 +1047,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1063,13 +1063,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1081,13 +1081,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1097,25 +1097,25 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
*/ __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-down mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1125,13 +1125,13 @@ __CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-up mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1142,13 +1142,13 @@ __CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Truncate input argument to the integral part. -* +* * \details Round \p h to the nearest integer value that does not exceed \p h in * magnitude. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The truncated integer value. +* \retval The truncated integer value. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1158,12 +1158,12 @@ __CUDA_FP16_DECL__ __half htrunc(const __half h); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Calculate ceiling of the input argument. -* +* * \details Compute the smallest integer value not less than \p h. -* \param[in] h - half. 
Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The smallest integer value not less than \p h. +* \retval The smallest integer value not less than \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1173,12 +1173,12 @@ __CUDA_FP16_DECL__ __half hceil(const __half h); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Calculate the largest integer less than or equal to \p h. -* +* * \details Calculate the largest integer value which is less than or equal to \p h. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The largest integer value which is less than or equal to \p h. +* \retval The largest integer value which is less than or equal to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1189,13 +1189,13 @@ __CUDA_FP16_DECL__ __half hfloor(const __half h); * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Round input to nearest integer value in half-precision floating-point * number. -* +* * \details Round \p h to the nearest integer value in half-precision floating-point * format, with halfway cases rounded to the nearest even integer value. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The nearest integer to \p h. +* \retval The nearest integer to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1206,13 +1206,13 @@ __CUDA_FP16_DECL__ __half hrint(const __half h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Truncate \p half2 vector input argument to the integral part. -* +* * \details Round each component of vector \p h to the nearest integer value that does * not exceed \p h in magnitude. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. 
+* * \returns half2 -* \retval The truncated \p h. +* \retval The truncated \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1222,13 +1222,13 @@ __CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculate \p half2 vector ceiling of the input argument. -* +* * \details For each component of vector \p h compute the smallest integer value not less * than \p h. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. +* * \returns half2 -* \retval The vector of smallest integers not less than \p h. +* \retval The vector of smallest integers not less than \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1238,13 +1238,13 @@ __CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculate the largest integer less than or equal to \p h. -* +* * \details For each component of vector \p h calculate the largest integer value which * is less than or equal to \p h. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. +* * \returns half2 -* \retval The vector of largest integers which is less than or equal to \p h. +* \retval The vector of largest integers which is less than or equal to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1255,14 +1255,14 @@ __CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Round input to nearest integer value in half-precision floating-point * number. -* +* * \details Round each component of \p half2 vector \p h to the nearest integer value in * half-precision floating-point format, with halfway cases rounded to the * nearest even integer value. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. 
+* * \returns half2 -* \retval The vector of rounded integer values. +* \retval The vector of rounded integer values. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1273,13 +1273,13 @@ __CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Returns \p half2 with both halves equal to the input value. -* +* * \details Returns \p half2 number with both halves equal to the input \p a \p half * number. -* \param[in] a - half. Is only being read. -* +* \param[in] a - half. Is only being read. +* * \returns half2 -* \retval The vector which has both its halves equal to the input \p a. +* \retval The vector which has both its halves equal to the input \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1289,13 +1289,13 @@ __CUDA_FP16_DECL__ __half2 __half2half2(const __half a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Swaps both halves of the \p half2 input. -* +* * \details Swaps both halves of the \p half2 input and returns a new \p half2 number * with swapped halves. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval a with its halves being swapped. +* \retval a with its halves being swapped. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1305,17 +1305,17 @@ __CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts low 16 bits from each of the two \p half2 inputs and combines -* into one \p half2 number. -* +* into one \p half2 number. +* * \details Extracts low 16 bits from each of the two \p half2 inputs and combines into * one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of * the return value, low 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - half2. Is only being read. 
-* \param[in] b - half2. Is only being read. -* +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 -* \retval The low 16 bits of \p a and of \p b. +* \retval The low 16 bits of \p a and of \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1326,16 +1326,16 @@ __CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts high 16 bits from each of the two \p half2 inputs and * combines into one \p half2 number. -* +* * \details Extracts high 16 bits from each of the two \p half2 inputs and combines into * one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of * the return value, high 16 bits from input \p b is stored in high 16 bits of * the return value. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 -* \retval The high 16 bits of \p a and of \p b. +* \retval The high 16 bits of \p a and of \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1347,10 +1347,10 @@ __CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); * \brief Returns high 16 bits of \p half2 input. * * \details Returns high 16 bits of \p half2 input \p a. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half -* \retval The high 16 bits of the input. +* \retval The high 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1362,10 +1362,10 @@ __CUDA_FP16_DECL__ __half __high2half(const __half2 a); * \brief Returns low 16 bits of \p half2 input. * * \details Returns low 16 bits of \p half2 input \p a. -* \param[in] a - half2. Is only being read. 
+* \param[in] a - half2. Is only being read. * * \returns half -* \retval Returns \p half which contains low 16 bits of the input \p a. +* \retval Returns \p half which contains low 16 bits of the input \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1375,14 +1375,14 @@ __CUDA_FP16_DECL__ __half __low2half(const __half2 a); /** * \ingroup CUDA_MATH__HALF_COMPARISON * \brief Checks if the input \p half number is infinite. -* -* \details Checks if the input \p half number \p a is infinite. -* \param[in] a - half. Is only being read. -* -* \returns int -* \retval -1 iff \p a is equal to negative infinity, -* \retval 1 iff \p a is equal to positive infinity, -* \retval 0 otherwise. +* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* \retval -1 iff \p a is equal to negative infinity, +* \retval 1 iff \p a is equal to positive infinity, +* \retval 0 otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1392,15 +1392,15 @@ __CUDA_FP16_DECL__ int __hisinf(const __half a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Combines two \p half numbers into one \p half2 number. -* +* * \details Combines two input \p half number \p a and \p b into one \p half2 number. * Input \p a is stored in low 16 bits of the return value, input \p b is stored * in high 16 bits of the return value. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* * \returns half2 -* \retval The half2 with one half equal to \p a and the other to \p b. +* \retval The half2 with one half equal to \p a and the other to \p b. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1410,13 +1410,13 @@ __CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts low 16 bits from \p half2 input. -* +* * \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 * number which has both halves equal to the extracted bits. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval The half2 with both halves equal to the low 16 bits of the input. +* \retval The half2 with both halves equal to the low 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1426,13 +1426,13 @@ __CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts high 16 bits from \p half2 input. -* +* * \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 * number which has both halves equal to the extracted bits. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval The half2 with both halves equal to the high 16 bits of the input. +* \retval The half2 with both halves equal to the high 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1443,13 +1443,13 @@ __CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a \p half as a signed short integer. -* +* * \details Reinterprets the bits in the half-precision floating-point number \p h -* as a signed short integer. -* \param[in] h - half. Is only being read. -* +* as a signed short integer. +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval The reinterpreted value. +* \retval The reinterpreted value. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1459,11 +1459,11 @@ __CUDA_FP16_DECL__ short int __half_as_short(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a \p half as an unsigned short integer. -* +* * \details Reinterprets the bits in the half-precision floating-point \p h * as an unsigned short number. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int * \retval The reinterpreted value. * \internal @@ -1475,11 +1475,11 @@ __CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a signed short integer as a \p half. -* +* * \details Reinterprets the bits in the signed short integer \p i as a * half-precision floating-point number. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half * \retval The reinterpreted value. * \internal @@ -1491,11 +1491,11 @@ __CUDA_FP16_DECL__ __half __short_as_half(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in an unsigned short integer as a \p half. -* +* * \details Reinterprets the bits in the unsigned short integer \p i as a * half-precision floating-point number. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half * \retval The reinterpreted value. * \internal @@ -1534,22 +1534,22 @@ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. 
-* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1558,22 +1558,22 @@ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. 
The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1582,22 +1582,22 @@ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, c __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. -* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
-* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1606,21 +1606,21 @@ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. 
If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1629,22 +1629,22 @@ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 v __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). 
width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1653,21 +1653,21 @@ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 va __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
-* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1676,22 +1676,22 @@ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, con __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. -* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. 
If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1700,21 +1700,21 @@ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. 
This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1875,13 +1875,13 @@ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs half2 vector if-equal comparison. -* +* * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of if-equal comparison of vectors \p a and \p b. * \internal @@ -1893,13 +1893,13 @@ __CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector not-equal comparison. -* +* * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of not-equal comparison of vectors \p a and \p b. 
* \internal @@ -1915,8 +1915,8 @@ __CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 result of less-equal comparison of vectors \p a and \p b. @@ -1933,8 +1933,8 @@ __CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of greater-equal comparison of vectors \p a and \p b. @@ -1951,8 +1951,8 @@ __CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The half2 vector result of less-than comparison of vectors \p a and \p b. @@ -1965,13 +1965,13 @@ __CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector greater-than comparison. 
-* +* * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of greater-than comparison of vectors \p a and \p b. * \internal @@ -1983,13 +1983,13 @@ __CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector unordered if-equal comparison. -* +* * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of unordered if-equal comparison of vectors \p a and \p b. * \internal @@ -2005,8 +2005,8 @@ __CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered not-equal comparison of vectors \p a and \p b. @@ -2023,8 +2023,8 @@ __CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); * Performs \p half2 vector less-equal comparison of inputs \p a and \p b. 
* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered less-equal comparison of vectors \p a and \p b. @@ -2041,8 +2041,8 @@ __CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. @@ -2059,8 +2059,8 @@ __CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered less-than comparison of vectors \p a and \p b. @@ -2077,8 +2077,8 @@ __CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. @@ -2093,11 +2093,11 @@ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); * \brief Determine whether \p half2 argument is a NaN. * * \details Determine whether each half of input \p half2 number \p a is a NaN. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The half2 with the corresponding \p half results set to -* 1.0 for NaN, 0.0 otherwise. +* 1.0 for NaN, 0.0 otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2113,11 +2113,11 @@ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); * \internal * \req DEEPLEARN-SRM_REQ-95 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The sum of vectors \p a and \p b. +* \retval The sum of vectors \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2133,11 +2133,11 @@ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-104 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The subtraction of vector \p b from \p a. +* \retval The subtraction of vector \p b from \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2153,11 +2153,11 @@ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-102 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. 
Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise multiplying the vectors \p a and \p b. +* \retval The result of elementwise multiplying the vectors \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2173,11 +2173,11 @@ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-103 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The elementwise division of \p a with \p b. +* \retval The elementwise division of \p a with \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2191,10 +2191,10 @@ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); * * \details Calculates the absolute value of both halves of the input \p half2 number and * returns the result. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 -* \retval Returns \p a with the absolute value of both halves. +* \retval Returns \p a with the absolute value of both halves. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2209,11 +2209,11 @@ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); * \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest * mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to * +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The sum of \p a and \p b, with respect to saturation. 
+* \retval The sum of \p a and \p b, with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2228,8 +2228,8 @@ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); * \details Subtracts \p half2 input vector \p b from input vector \p a in * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN * results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The subtraction of vector \p b from \p a, with respect to saturation. @@ -2247,12 +2247,12 @@ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); * \details Performs \p half2 vector multiplication of inputs \p a and \p b, in * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN * results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise multiplication of vectors \p a and \p b, -* with respect to saturation. +* \retval The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2270,12 +2270,12 @@ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-105 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* \param[in] c - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. 
* * \returns half2 -* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2291,13 +2291,13 @@ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __hal * then performs a \p half2 vector add of the result with \p c, * rounding the result once in round-to-nearest-even mode, and clamps the * results to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* \param[in] c - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, -* with respect to saturation. +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2313,10 +2313,10 @@ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const _ * \internal * \req DEEPLEARN-SRM_REQ-101 * \endinternal -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 -* \retval Returns \p a with both halves negated. +* \retval Returns \p a with both halves negated. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2328,7 +2328,7 @@ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); * \brief Calculates the absolute value of input \p half number and returns the result. * * \details Calculates the absolute value of input \p half number and returns the result. -* \param[in] a - half. 
Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The absolute value of a. @@ -2347,11 +2347,11 @@ __CUDA_FP16_DECL__ __half __habs(const __half a); * \internal * \req DEEPLEARN-SRM_REQ-94 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The sum of \p a and \p b. +* \retval The sum of \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2367,11 +2367,11 @@ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-97 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The result of subtracting \p b from \p a. +* \retval The result of subtracting \p b from \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2387,27 +2387,27 @@ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-99 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The result of multiplying \p a and \p b. +* \retval The result of multiplying \p a and \p b. */ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); /** * \ingroup CUDA_MATH__HALF_ARITHMETIC * \brief Performs \p half division in round-to-nearest-even mode. -* +* * \details Divides \p half input \p a by input \p b in round-to-nearest * mode. * \internal * \req DEEPLEARN-SRM_REQ-98 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. 
-* +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* * \returns half -* \retval The result of dividing \p a by \p b. +* \retval The result of dividing \p a by \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2421,8 +2421,8 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); * * \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The sum of \p a and \p b, with respect to saturation. @@ -2440,8 +2440,8 @@ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); * \details Subtracts \p half input \p b from input \p a in round-to-nearest * mode, * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The result of subtraction of \p b from \p a, with respect to saturation. @@ -2459,8 +2459,8 @@ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); * \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest * mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to * +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The result of multiplying \p a and \p b, with respect to saturation. 
@@ -2480,13 +2480,13 @@ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-96 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* \param[in] c - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. * * \returns half * \retval The result of fused multiply-add operation on \p -* a, \p b, and \p c. +* a, \p b, and \p c. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2502,13 +2502,13 @@ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) * then performs a \p half add of the result with \p c, * rounding the result once in round-to-nearest-even mode, and clamps the result * to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* \param[in] c - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. * * \returns half * \retval The result of fused multiply-add operation on \p -* a, \p b, and \p c, with respect to saturation. +* a, \p b, and \p c, with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2523,7 +2523,7 @@ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __hal * \internal * \req DEEPLEARN-SRM_REQ-100 * \endinternal -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval minus a @@ -2542,8 +2542,8 @@ __CUDA_FP16_DECL__ __half __hneg(const __half a); * The bool result is set to true only if both \p half if-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. 
Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of if-equal comparison @@ -2564,13 +2564,13 @@ __CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half not-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of not-equal comparison -* of vectors \p a and \p b are true, -* \retval false otherwise. +* of vectors \p a and \p b are true, +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2586,13 +2586,13 @@ __CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of less-equal comparison -* of vectors \p a and \p b are true; -* \retval false otherwise. +* of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2608,13 +2608,13 @@ __CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. 
-* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of greater-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2630,13 +2630,13 @@ __CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of less-than comparison -* of vectors \p a and \p b are true; -* \retval false otherwise. +* of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2652,13 +2652,13 @@ __CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* -* \returns bool +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool * \retval true if both \p half results of greater-than -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2674,13 +2674,13 @@ __CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half if-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered if-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2696,13 +2696,13 @@ __CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half not-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered not-equal * comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2718,13 +2718,13 @@ __CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
* * \returns bool * \retval true if both \p half results of unordered less-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2741,13 +2741,13 @@ __CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered -* greater-equal comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* greater-equal comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2763,13 +2763,13 @@ __CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool -* \retval true if both \p half results of unordered less-than comparison of -* vectors \p a and \p b are true; -* \retval false otherwise. +* \retval true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* \retval false otherwise. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2786,13 +2786,13 @@ __CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered * greater-than comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2805,11 +2805,11 @@ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); * * \details Performs \p half if-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool -* \retval The boolean result of if-equal comparison of \p a and \p b. +* \retval The boolean result of if-equal comparison of \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2822,8 +2822,8 @@ __CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); * * \details Performs \p half not-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of not-equal comparison of \p a and \p b. 
@@ -2839,8 +2839,8 @@ __CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); * * \details Performs \p half less-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of less-equal comparison of \p a and \p b. @@ -2856,8 +2856,8 @@ __CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); * * \details Performs \p half greater-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of greater-equal comparison of \p a and \p b. @@ -2873,8 +2873,8 @@ __CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); * * \details Performs \p half less-than comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of less-than comparison of \p a and \p b. @@ -2890,8 +2890,8 @@ __CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); * * \details Performs \p half greater-than comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of greater-than comparison of \p a and \p b. 
@@ -2907,8 +2907,8 @@ __CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); * * \details Performs \p half if-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered if-equal comparison of \p a and @@ -2925,8 +2925,8 @@ __CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); * * \details Performs \p half not-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered not-equal comparison of \p a and @@ -2943,8 +2943,8 @@ __CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); * * \details Performs \p half less-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered less-equal comparison of \p a and @@ -2961,8 +2961,8 @@ __CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); * * \details Performs \p half greater-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
* * \returns bool * \retval The boolean result of unordered greater-equal comparison of \p a @@ -2979,8 +2979,8 @@ __CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); * * \details Performs \p half less-than comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered less-than comparison of \p a and @@ -2997,8 +2997,8 @@ __CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); * * \details Performs \p half greater-than comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered greater-than comparison of \p a @@ -3014,10 +3014,10 @@ __CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); * \brief Determine whether \p half argument is a NaN. * * \details Determine whether \p half value \p a is a NaN. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns bool -* \retval true iff argument is NaN. +* \retval true iff argument is NaN. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -3250,7 +3250,7 @@ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __ha * \brief Calculates \p half square root in round-to-nearest-even mode. * * \details Calculates \p half square root of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The square root of \p a. 
@@ -3267,7 +3267,7 @@ __CUDA_FP16_DECL__ __half hsqrt(const __half a); * * \details Calculates \p half reciprocal square root of input \p a in round-to-nearest * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The reciprocal square root of \p a. @@ -3282,7 +3282,7 @@ __CUDA_FP16_DECL__ __half hrsqrt(const __half a); * \brief Calculates \p half reciprocal in round-to-nearest-even mode. * * \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The reciprocal of \p a. @@ -3298,7 +3298,7 @@ __CUDA_FP16_DECL__ __half hrcp(const __half a); * * \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The natural logarithm of \p a. @@ -3314,7 +3314,7 @@ __CUDA_FP16_DECL__ __half hlog(const __half a); * * \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The binary logarithm of \p a. @@ -3330,7 +3330,7 @@ __CUDA_FP16_DECL__ __half hlog2(const __half a); * * \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The decimal logarithm of \p a. @@ -3347,7 +3347,7 @@ __CUDA_FP16_DECL__ __half hlog10(const __half a); * * \details Calculates \p half natural exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The natural exponential function on \p a. 
@@ -3364,7 +3364,7 @@ __CUDA_FP16_DECL__ __half hexp(const __half a); * * \details Calculates \p half binary exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The binary exponential function on \p a. @@ -3381,7 +3381,7 @@ __CUDA_FP16_DECL__ __half hexp2(const __half a); * * \details Calculates \p half decimal exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The decimal exponential function on \p a. @@ -3396,7 +3396,7 @@ __CUDA_FP16_DECL__ __half hexp10(const __half a); * \brief Calculates \p half cosine in round-to-nearest-even mode. * * \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The cosine of \p a. @@ -3411,7 +3411,7 @@ __CUDA_FP16_DECL__ __half hcos(const __half a); * \brief Calculates \p half sine in round-to-nearest-even mode. * * \details Calculates \p half sine of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The sine of \p a. @@ -3427,7 +3427,7 @@ __CUDA_FP16_DECL__ __half hsin(const __half a); * * \details Calculates \p half2 square root of input vector \p a in round-to-nearest * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise square root on vector \p a. @@ -3444,7 +3444,7 @@ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); * * \details Calculates \p half2 reciprocal square root of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. 
* * \returns half2 * \retval The elementwise reciprocal square root on vector \p a. @@ -3460,7 +3460,7 @@ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); * * \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise reciprocal on vector \p a. @@ -3477,7 +3477,7 @@ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); * * \details Calculates \p half2 natural logarithm of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise natural logarithm on vector \p a. @@ -3494,7 +3494,7 @@ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a); * * \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise binary logarithm on vector \p a. @@ -3511,7 +3511,7 @@ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); * * \details Calculates \p half2 decimal logarithm of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise decimal logarithm on vector \p a. @@ -3528,7 +3528,7 @@ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); * * \details Calculates \p half2 exponential function of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise exponential function on vector \p a. @@ -3545,7 +3545,7 @@ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); * * \details Calculates \p half2 binary exponential function of input vector \p a in * round-to-nearest-even mode. 
-* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise binary exponential function on vector \p a. @@ -3559,11 +3559,11 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector decimal exponential function in * round-to-nearest-even mode. -* +* * \details Calculates \p half2 decimal exponential function of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise decimal exponential function on vector \p a. * \internal @@ -3575,11 +3575,11 @@ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. -* +* * \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even * mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise cosine on vector \p a. * \internal @@ -3591,10 +3591,10 @@ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector sine in round-to-nearest-even mode. -* +* * \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise sine on vector \p a. * \internal diff --git a/numba_cuda/numba/cuda/cuda_fp16.hpp b/numba_cuda/numba/cuda/cuda_fp16.hpp index 19bbd3412..2bc123b58 100644 --- a/numba_cuda/numba/cuda/cuda_fp16.hpp +++ b/numba_cuda/numba/cuda/cuda_fp16.hpp @@ -60,7 +60,7 @@ # define __CPP_VERSION_AT_LEAST_11_FP16 #endif -/* C++11 header for std::move. +/* C++11 header for std::move. 
* In RTC mode, std::move is provided implicitly; don't include the header */ #if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) @@ -145,7 +145,7 @@ * Types which allow static initialization of "half" and "half2" until * these become an actual builtin. Note this initialization is as a * bitfield representation of "half", and not a conversion from short->half. -* Such a representation will be deprecated in a future version of CUDA. +* Such a representation will be deprecated in a future version of CUDA. * (Note these are visible to non-nvcc compilers, including C-only compilation) */ typedef struct __CUDA_ALIGN__(2) { @@ -2443,7 +2443,7 @@ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { #undef __CUDA_HOSTDEVICE_FP16_DECL__ #undef __CUDA_FP16_DECL__ - + /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ #if defined(__cplusplus) && !defined(CUDA_NO_HALF) diff --git a/numba_cuda/numba/cuda/cuda_paths.py b/numba_cuda/numba/cuda/cuda_paths.py index 4290a0a95..7d7f7ce6f 100644 --- a/numba_cuda/numba/cuda/cuda_paths.py +++ b/numba_cuda/numba/cuda/cuda_paths.py @@ -9,7 +9,7 @@ from numba import config -_env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info']) +_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"]) def _find_valid_path(options): @@ -21,16 +21,16 @@ def _find_valid_path(options): if data is not None: return by, data else: - return '', None + return "", None def _get_libdevice_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_libdevice_ctk()), - ('CUDA_HOME', get_cuda_home('nvvm', 'libdevice')), - ('System', get_system_ctk('nvvm', 'libdevice')), - ('Debian package', get_debian_pkg_libdevice()), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA 
package)", get_nvidia_libdevice_ctk()), + ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")), + ("System", get_system_ctk("nvvm", "libdevice")), + ("Debian package", get_debian_pkg_libdevice()), ] by, libdir = _find_valid_path(options) return by, libdir @@ -38,17 +38,17 @@ def _get_libdevice_path_decision(): def _nvvm_lib_dir(): if IS_WIN32: - return 'nvvm', 'bin' + return "nvvm", "bin" else: - return 'nvvm', 'lib64' + return "nvvm", "lib64" def _get_nvvm_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_nvvm_ctk()), - ('CUDA_HOME', get_cuda_home(*_nvvm_lib_dir())), - ('System', get_system_ctk(*_nvvm_lib_dir())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()), + ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())), + ("System", get_system_ctk(*_nvvm_lib_dir())), ] by, path = _find_valid_path(options) return by, path @@ -57,7 +57,7 @@ def _get_nvvm_path_decision(): def _get_libdevice_paths(): by, libdir = _get_libdevice_path_decision() # Search for pattern - pat = r'libdevice(\.\d+)*\.bc$' + pat = r"libdevice(\.\d+)*\.bc$" candidates = find_file(re.compile(pat), libdir) # Keep only the max (most recent version) of the bitcode files. 
out = max(candidates, default=None) @@ -66,24 +66,24 @@ def _get_libdevice_paths(): def _cudalib_path(): if IS_WIN32: - return 'bin' + return "bin" else: - return 'lib64' + return "lib64" def _cuda_home_static_cudalib_path(): if IS_WIN32: - return ('lib', 'x64') + return ("lib", "x64") else: - return ('lib64',) + return ("lib64",) def _get_cudalib_dir_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_cudalib_ctk()), - ('CUDA_HOME', get_cuda_home(_cudalib_path())), - ('System', get_system_ctk(_cudalib_path())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), ] by, libdir = _find_valid_path(options) return by, libdir @@ -91,10 +91,10 @@ def _get_cudalib_dir_path_decision(): def _get_static_cudalib_dir_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_static_cudalib_ctk()), - ('CUDA_HOME', get_cuda_home(*_cuda_home_static_cudalib_path())), - ('System', get_system_ctk(_cudalib_path())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), ] by, libdir = _find_valid_path(options) return by, libdir @@ -111,25 +111,23 @@ def _get_static_cudalib_dir(): def get_system_ctk(*subdirs): - """Return path to system-wide cudatoolkit; or, None if it doesn't exist. - """ + """Return path to system-wide cudatoolkit; or, None if it doesn't exist.""" # Linux? - if sys.platform.startswith('linux'): + if sys.platform.startswith("linux"): # Is cuda alias to /usr/local/cuda? # We are intentionally not getting versioned cuda installation. 
- base = '/usr/local/cuda' + base = "/usr/local/cuda" if os.path.exists(base): return os.path.join(base, *subdirs) def get_conda_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit. - """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + """Return path to directory containing the shared libraries of cudatoolkit.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return # Assume the existence of NVVM to imply cudatoolkit installed - paths = find_lib('nvvm') + paths = find_lib("nvvm") if not paths: return # Use the directory name of the max path @@ -137,9 +135,8 @@ def get_conda_ctk(): def get_nvidia_nvvm_ctk(): - """Return path to directory containing the NVVM shared library. - """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + """Return path to directory containing the NVVM shared library.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return @@ -147,16 +144,16 @@ def get_nvidia_nvvm_ctk(): # conda package is installed. 
# First, try the location used on Linux and the Windows 11.x packages - libdir = os.path.join(sys.prefix, 'nvvm', _cudalib_path()) + libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path()) if not os.path.exists(libdir) or not os.path.isdir(libdir): # If that fails, try the location used for Windows 12.x packages - libdir = os.path.join(sys.prefix, 'Library', 'nvvm', _cudalib_path()) + libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path()) if not os.path.exists(libdir) or not os.path.isdir(libdir): # If that doesn't exist either, assume we don't have the NVIDIA # conda package return - paths = find_lib('nvvm', libdir=libdir) + paths = find_lib("nvvm", libdir=libdir) if not paths: return # Use the directory name of the max path @@ -164,39 +161,36 @@ def get_nvidia_nvvm_ctk(): def get_nvidia_libdevice_ctk(): - """Return path to directory containing the libdevice library. - """ + """Return path to directory containing the libdevice library.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return nvvm_dir = os.path.dirname(nvvm_ctk) - return os.path.join(nvvm_dir, 'libdevice') + return os.path.join(nvvm_dir, "libdevice") def get_nvidia_cudalib_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit. - """ + """Return path to directory containing the shared libraries of cudatoolkit.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) - subdir = 'bin' if IS_WIN32 else 'lib' + subdir = "bin" if IS_WIN32 else "lib" return os.path.join(env_dir, subdir) def get_nvidia_static_cudalib_ctk(): - """Return path to directory containing the static libraries of cudatoolkit. 
- """ + """Return path to directory containing the static libraries of cudatoolkit.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return if IS_WIN32 and ("Library" not in nvvm_ctk): # Location specific to CUDA 11.x packages on Windows - dirs = ('Lib', 'x64') + dirs = ("Lib", "x64") else: # Linux, or Windows with CUDA 12.x packages - dirs = ('lib',) + dirs = ("lib",) env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) return os.path.join(env_dir, *dirs) @@ -207,17 +201,17 @@ def get_cuda_home(*subdirs): If *subdirs* are the subdirectory name to be appended in the resulting path. """ - cuda_home = os.environ.get('CUDA_HOME') + cuda_home = os.environ.get("CUDA_HOME") if cuda_home is None: # Try Windows CUDA installation without Anaconda - cuda_home = os.environ.get('CUDA_PATH') + cuda_home = os.environ.get("CUDA_PATH") if cuda_home is not None: return os.path.join(cuda_home, *subdirs) def _get_nvvm_path(): by, path = _get_nvvm_path_decision() - candidates = find_lib('nvvm', path) + candidates = find_lib("nvvm", path) path = max(candidates) if candidates else None return _env_path_tuple(by, path) @@ -234,16 +228,16 @@ def get_cuda_paths(): Note: The result of the function is cached. """ # Check cache - if hasattr(get_cuda_paths, '_cached_result'): + if hasattr(get_cuda_paths, "_cached_result"): return get_cuda_paths._cached_result else: # Not in cache d = { - 'nvvm': _get_nvvm_path(), - 'libdevice': _get_libdevice_paths(), - 'cudalib_dir': _get_cudalib_dir(), - 'static_cudalib_dir': _get_static_cudalib_dir(), - 'include_dir': _get_include_dir(), + "nvvm": _get_nvvm_path(), + "libdevice": _get_libdevice_paths(), + "cudalib_dir": _get_cudalib_dir(), + "static_cudalib_dir": _get_static_cudalib_dir(), + "include_dir": _get_include_dir(), } # Cache result get_cuda_paths._cached_result = d @@ -255,7 +249,7 @@ def get_debian_pkg_libdevice(): Return the Debian NVIDIA Maintainers-packaged libdevice location, if it exists. 
""" - pkg_libdevice_location = '/usr/lib/nvidia-cuda-toolkit/libdevice' + pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice" if not os.path.exists(pkg_libdevice_location): return None return pkg_libdevice_location @@ -274,13 +268,10 @@ def get_current_cuda_target_name(): machine = platform.machine() if system == "Linux": - arch_to_targets = { - 'x86_64': 'x86_64-linux', - 'aarch64': 'sbsa-linux' - } + arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"} elif system == "Windows": arch_to_targets = { - 'AMD64': 'x64', + "AMD64": "x64", } else: arch_to_targets = {} @@ -293,26 +284,28 @@ def get_conda_include_dir(): Return the include directory in the current conda environment, if one is active and it exists. """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return if platform.system() == "Windows": - include_dir = os.path.join( - sys.prefix, 'Library', 'include' - ) + include_dir = os.path.join(sys.prefix, "Library", "include") elif target_name := get_current_cuda_target_name(): include_dir = os.path.join( - sys.prefix, 'targets', target_name, 'include' + sys.prefix, "targets", target_name, "include" ) else: # A fallback when target cannot determined # though usually it shouldn't. 
- include_dir = os.path.join(sys.prefix, 'include') + include_dir = os.path.join(sys.prefix, "include") - if (os.path.exists(include_dir) and os.path.isdir(include_dir) - and os.path.exists(os.path.join(include_dir, - 'cuda_device_runtime_api.h'))): + if ( + os.path.exists(include_dir) + and os.path.isdir(include_dir) + and os.path.exists( + os.path.join(include_dir, "cuda_device_runtime_api.h") + ) + ): return include_dir return @@ -320,8 +313,8 @@ def get_conda_include_dir(): def _get_include_dir(): """Find the root include directory.""" options = [ - ('Conda environment (NVIDIA package)', get_conda_include_dir()), - ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH), + ("Conda environment (NVIDIA package)", get_conda_include_dir()), + ("CUDA_INCLUDE_PATH Config Entry", config.CUDA_INCLUDE_PATH), # TODO: add others ] by, include_dir = _find_valid_path(options) diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py index de2541e58..547272601 100644 --- a/numba_cuda/numba/cuda/cudadecl.py +++ b/numba_cuda/numba/cuda/cudadecl.py @@ -1,15 +1,23 @@ import operator from numba.core import types -from numba.core.typing.npydecl import (parse_dtype, parse_shape, - register_number_classes, - register_numpy_ufunc, - trigonometric_functions, - comparison_functions, - math_operations, - bit_twiddling_functions) -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - AbstractTemplate, CallableTemplate, - signature, Registry) +from numba.core.typing.npydecl import ( + parse_dtype, + parse_shape, + register_number_classes, + register_numpy_ufunc, + trigonometric_functions, + comparison_functions, + math_operations, + bit_twiddling_functions, +) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + AbstractTemplate, + CallableTemplate, + signature, + Registry, +) from numba.cuda.types import dim3 from numba.core.typeconv import Conversion from numba import cuda @@ -26,15 +34,15 @@ class 
Cuda_array_decl(CallableTemplate): def generic(self): def typer(shape, dtype): - # Only integer literals and tuples of integer literals are valid # shapes if isinstance(shape, types.Integer): if not isinstance(shape, types.IntegerLiteral): return None elif isinstance(shape, (types.Tuple, types.UniTuple)): - if any([not isinstance(s, types.IntegerLiteral) - for s in shape]): + if any( + [not isinstance(s, types.IntegerLiteral) for s in shape] + ): return None else: return None @@ -42,7 +50,7 @@ def typer(shape, dtype): ndim = parse_shape(shape) nb_dtype = parse_dtype(dtype) if nb_dtype is not None and ndim is not None: - return types.Array(dtype=nb_dtype, ndim=ndim, layout='C') + return types.Array(dtype=nb_dtype, ndim=ndim, layout="C") return typer @@ -64,6 +72,7 @@ class Cuda_const_array_like(CallableTemplate): def generic(self): def typer(ndarray): return ndarray + return typer @@ -95,22 +104,49 @@ class Cuda_syncwarp(ConcreteTemplate): class Cuda_shfl_sync_intrinsic(ConcreteTemplate): key = cuda.shfl_sync_intrinsic cases = [ - signature(types.Tuple((types.i4, types.b1)), - types.i4, types.i4, types.i4, types.i4, types.i4), - signature(types.Tuple((types.i8, types.b1)), - types.i4, types.i4, types.i8, types.i4, types.i4), - signature(types.Tuple((types.f4, types.b1)), - types.i4, types.i4, types.f4, types.i4, types.i4), - signature(types.Tuple((types.f8, types.b1)), - types.i4, types.i4, types.f8, types.i4, types.i4), + signature( + types.Tuple((types.i4, types.b1)), + types.i4, + types.i4, + types.i4, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.i8, types.b1)), + types.i4, + types.i4, + types.i8, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.f4, types.b1)), + types.i4, + types.i4, + types.f4, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.f8, types.b1)), + types.i4, + types.i4, + types.f8, + types.i4, + types.i4, + ), ] @register class Cuda_vote_sync_intrinsic(ConcreteTemplate): key = cuda.vote_sync_intrinsic - 
cases = [signature(types.Tuple((types.i4, types.b1)), - types.i4, types.i4, types.b1)] + cases = [ + signature( + types.Tuple((types.i4, types.b1)), types.i4, types.i4, types.b1 + ) + ] @register @@ -153,6 +189,7 @@ class Cuda_popc(ConcreteTemplate): Supported types from `llvm.popc` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.popc cases = [ signature(types.int8, types.int8), @@ -172,6 +209,7 @@ class Cuda_fma(ConcreteTemplate): Supported types from `llvm.fma` [here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics) """ + key = cuda.fma cases = [ signature(types.float32, types.float32, types.float32, types.float32), @@ -189,7 +227,6 @@ class Cuda_hfma(ConcreteTemplate): @register class Cuda_cbrt(ConcreteTemplate): - key = cuda.cbrt cases = [ signature(types.float32, types.float32), @@ -212,6 +249,7 @@ class Cuda_clz(ConcreteTemplate): Supported types from `llvm.ctlz` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.clz cases = [ signature(types.int8, types.int8), @@ -231,6 +269,7 @@ class Cuda_ffs(ConcreteTemplate): Supported types from `llvm.cttz` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.ffs cases = [ signature(types.uint32, types.int8), @@ -254,10 +293,16 @@ def generic(self, args, kws): # per docs # http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp - supported_types = (types.float64, types.float32, - types.int16, types.uint16, - types.int32, types.uint32, - types.int64, types.uint64) + supported_types = ( + types.float64, + types.float32, + types.int16, + types.uint16, + types.int32, + types.uint32, + types.int64, + types.uint64, + ) if a != b or a not in supported_types: return @@ -298,7 +343,6 @@ class Cuda_fp16_binary(ConcreteTemplate): @register_global(float) class Float(AbstractTemplate): - 
def generic(self, args, kws): assert not kws @@ -313,11 +357,11 @@ def _genfp16_binary_comparison(l_key): class Cuda_fp16_cmp(ConcreteTemplate): key = l_key - cases = [ - signature(types.b1, types.float16, types.float16) - ] + cases = [signature(types.b1, types.float16, types.float16)] + return Cuda_fp16_cmp + # If multiple ConcreteTemplates provide typing for a single function, then # function resolution will pick the first compatible typing it finds even if it # involves inserting a cast that would be considered undesirable (in this @@ -340,9 +384,10 @@ class Cuda_fp16_operator(AbstractTemplate): def generic(self, args, kws): assert not kws - if len(args) == 2 and \ - (args[0] == types.float16 or args[1] == types.float16): - if (args[0] == types.float16): + if len(args) == 2 and ( + args[0] == types.float16 or args[1] == types.float16 + ): + if args[0] == types.float16: convertible = self.context.can_convert(args[1], args[0]) else: convertible = self.context.can_convert(args[0], args[1]) @@ -355,9 +400,11 @@ def generic(self, args, kws): # 3. 
fp16 to int8 (safe conversion) - # - Conversion.safe - if (convertible == Conversion.exact) or \ - (convertible == Conversion.promote) or \ - (convertible == Conversion.safe): + if ( + (convertible == Conversion.exact) + or (convertible == Conversion.promote) + or (convertible == Conversion.safe) + ): return signature(retty, types.float16, types.float16) return Cuda_fp16_operator @@ -404,38 +451,42 @@ def _genfp16_binary_operator(op): def _resolve_wrapped_unary(fname): link = tuple() - decl = declare_device_function_template(f'__numba_wrapper_{fname}', - types.float16, - (types.float16,), - link) + decl = declare_device_function_template( + f"__numba_wrapper_{fname}", types.float16, (types.float16,), link + ) return types.Function(decl) def _resolve_wrapped_binary(fname): link = tuple() - decl = declare_device_function_template(f'__numba_wrapper_{fname}', - types.float16, - (types.float16, types.float16,), - link) + decl = declare_device_function_template( + f"__numba_wrapper_{fname}", + types.float16, + ( + types.float16, + types.float16, + ), + link, + ) return types.Function(decl) -hsin_device = _resolve_wrapped_unary('hsin') -hcos_device = _resolve_wrapped_unary('hcos') -hlog_device = _resolve_wrapped_unary('hlog') -hlog10_device = _resolve_wrapped_unary('hlog10') -hlog2_device = _resolve_wrapped_unary('hlog2') -hexp_device = _resolve_wrapped_unary('hexp') -hexp10_device = _resolve_wrapped_unary('hexp10') -hexp2_device = _resolve_wrapped_unary('hexp2') -hsqrt_device = _resolve_wrapped_unary('hsqrt') -hrsqrt_device = _resolve_wrapped_unary('hrsqrt') -hfloor_device = _resolve_wrapped_unary('hfloor') -hceil_device = _resolve_wrapped_unary('hceil') -hrcp_device = _resolve_wrapped_unary('hrcp') -hrint_device = _resolve_wrapped_unary('hrint') -htrunc_device = _resolve_wrapped_unary('htrunc') -hdiv_device = _resolve_wrapped_binary('hdiv') +hsin_device = _resolve_wrapped_unary("hsin") +hcos_device = _resolve_wrapped_unary("hcos") +hlog_device = 
_resolve_wrapped_unary("hlog") +hlog10_device = _resolve_wrapped_unary("hlog10") +hlog2_device = _resolve_wrapped_unary("hlog2") +hexp_device = _resolve_wrapped_unary("hexp") +hexp10_device = _resolve_wrapped_unary("hexp10") +hexp2_device = _resolve_wrapped_unary("hexp2") +hsqrt_device = _resolve_wrapped_unary("hsqrt") +hrsqrt_device = _resolve_wrapped_unary("hrsqrt") +hfloor_device = _resolve_wrapped_unary("hfloor") +hceil_device = _resolve_wrapped_unary("hceil") +hrcp_device = _resolve_wrapped_unary("hrcp") +hrint_device = _resolve_wrapped_unary("hrint") +htrunc_device = _resolve_wrapped_unary("htrunc") +hdiv_device = _resolve_wrapped_binary("hdiv") # generate atomic operations @@ -455,15 +506,20 @@ def generic(self, args, kws): return signature(ary.dtype, ary, types.intp, ary.dtype) elif ary.ndim > 1: return signature(ary.dtype, ary, idx, ary.dtype) + return Cuda_atomic -all_numba_types = (types.float64, types.float32, - types.int32, types.uint32, - types.int64, types.uint64) +all_numba_types = ( + types.float64, + types.float32, + types.int32, + types.uint32, + types.int64, + types.uint64, +) -integer_numba_types = (types.int32, types.uint32, - types.int64, types.uint64) +integer_numba_types = (types.int32, types.uint32, types.int64, types.uint64) unsigned_int_numba_types = (types.uint32, types.uint64) @@ -811,5 +867,5 @@ def resolve_local(self, mod): register_numpy_ufunc(func, register_global) for func in math_operations: - if func in ('log', 'log2', 'log10'): + if func in ("log", "log2", "log10"): register_numpy_ufunc(func, register_global) diff --git a/numba_cuda/numba/cuda/cudadrv/__init__.py b/numba_cuda/numba/cuda/cudadrv/__init__.py index 33bfca345..c7d60a5e3 100644 --- a/numba_cuda/numba/cuda/cudadrv/__init__.py +++ b/numba_cuda/numba/cuda/cudadrv/__init__.py @@ -5,5 +5,7 @@ - Device array implementation """ + from numba.core import config -assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator' + +assert not config.ENABLE_CUDASIM, 
"Cannot use real driver API with simulator" diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 87b00edcf..7ffbca924 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -25,7 +25,7 @@ from warnings import warn try: - lru_cache = getattr(functools, 'lru_cache')(None) + lru_cache = getattr(functools, "lru_cache")(None) except AttributeError: # Python 3.1 or lower def lru_cache(func): @@ -34,7 +34,7 @@ def lru_cache(func): def is_cuda_ndarray(obj): "Check if an object is a CUDA ndarray" - return getattr(obj, '__cuda_ndarray__', False) + return getattr(obj, "__cuda_ndarray__", False) def verify_cuda_ndarray_interface(obj): @@ -45,25 +45,25 @@ def requires_attr(attr, typ): if not hasattr(obj, attr): raise AttributeError(attr) if not isinstance(getattr(obj, attr), typ): - raise AttributeError('%s must be of type %s' % (attr, typ)) + raise AttributeError("%s must be of type %s" % (attr, typ)) - requires_attr('shape', tuple) - requires_attr('strides', tuple) - requires_attr('dtype', np.dtype) - requires_attr('size', int) + requires_attr("shape", tuple) + requires_attr("strides", tuple) + requires_attr("dtype", np.dtype) + requires_attr("size", int) def require_cuda_ndarray(obj): "Raises ValueError is is_cuda_ndarray(obj) evaluates False" if not is_cuda_ndarray(obj): - raise ValueError('require an cuda ndarray object') + raise ValueError("require an cuda ndarray object") class DeviceNDArrayBase(_devicearray.DeviceArray): - """A on GPU NDArray representation - """ + """A on GPU NDArray representation""" + __cuda_memory__ = True - __cuda_ndarray__ = True # There must be gpu_data attribute + __cuda_ndarray__ = True # There must be gpu_data attribute def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): """ @@ -88,9 +88,10 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): dtype = np.dtype(dtype) self.ndim = len(shape) if 
len(strides) != self.ndim: - raise ValueError('strides not match ndim') - self._dummy = dummyarray.Array.from_desc(0, shape, strides, - dtype.itemsize) + raise ValueError("strides not match ndim") + self._dummy = dummyarray.Array.from_desc( + 0, shape, strides, dtype.itemsize + ) self.shape = tuple(shape) self.strides = tuple(strides) self.dtype = dtype @@ -99,7 +100,8 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): if self.size > 0: if gpu_data is None: self.alloc_size = _driver.memory_size_from_info( - self.shape, self.strides, self.dtype.itemsize) + self.shape, self.strides, self.dtype.itemsize + ) gpu_data = devices.get_context().memalloc(self.alloc_size) else: self.alloc_size = _driver.device_memory_size(gpu_data) @@ -109,8 +111,9 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): null = _driver.binding.CUdeviceptr(0) else: null = c_void_p(0) - gpu_data = _driver.MemoryPointer(context=devices.get_context(), - pointer=null, size=0) + gpu_data = _driver.MemoryPointer( + context=devices.get_context(), pointer=null, size=0 + ) self.alloc_size = 0 self.gpu_data = gpu_data @@ -130,12 +133,12 @@ def __cuda_array_interface__(self): ptr = 0 return { - 'shape': tuple(self.shape), - 'strides': None if is_contiguous(self) else tuple(self.strides), - 'data': (ptr, False), - 'typestr': self.dtype.str, - 'stream': int(self.stream) if self.stream != 0 else None, - 'version': 3, + "shape": tuple(self.shape), + "strides": None if is_contiguous(self) else tuple(self.strides), + "data": (ptr, False), + "typestr": self.dtype.str, + "stream": int(self.stream) if self.stream != 0 else None, + "version": 3, } def bind(self, stream=0): @@ -160,6 +163,7 @@ def transpose(self, axes=None): raise ValueError("invalid axes list %r" % (axes,)) else: from numba.cuda.kernels.transpose import transpose + return transpose(self) def _default_stream(self, stream): @@ -186,20 +190,19 @@ def _numba_type_(self): # layouts. 
broadcast = 0 in self.strides - if self.flags['C_CONTIGUOUS'] and not broadcast: - layout = 'C' - elif self.flags['F_CONTIGUOUS'] and not broadcast: - layout = 'F' + if self.flags["C_CONTIGUOUS"] and not broadcast: + layout = "C" + elif self.flags["F_CONTIGUOUS"] and not broadcast: + layout = "F" else: - layout = 'A' + layout = "A" dtype = numpy_support.from_dtype(self.dtype) return types.Array(dtype, self.ndim, layout) @property def device_ctypes_pointer(self): - """Returns the ctypes pointer to the GPU data buffer - """ + """Returns the ctypes pointer to the GPU data buffer""" if self.gpu_data is None: if _driver.USE_NV_BINDING: return _driver.binding.CUdeviceptr(0) @@ -232,13 +235,16 @@ def copy_to_device(self, ary, stream=0): # (i.e., in order to materialize a writable strided view) ary_core = np.array( ary_core, - order='C' if self_core.flags['C_CONTIGUOUS'] else 'F', + order="C" if self_core.flags["C_CONTIGUOUS"] else "F", subok=True, - copy=(not ary_core.flags['WRITEABLE']) - if numpy_version < (2, 0) else None) + copy=(not ary_core.flags["WRITEABLE"]) + if numpy_version < (2, 0) + else None, + ) check_array_compatibility(self_core, ary_core) - _driver.host_to_device(self, ary_core, self.alloc_size, - stream=stream) + _driver.host_to_device( + self, ary_core, self.alloc_size, stream=stream + ) @devices.require_context def copy_to_host(self, ary=None, stream=0): @@ -264,7 +270,7 @@ def copy_to_host(self, ary=None, stream=0): result_array = d_arr.copy_to_host() """ if any(s < 0 for s in self.strides): - msg = 'D->H copy not implemented for negative strides: {}' + msg = "D->H copy not implemented for negative strides: {}" raise NotImplementedError(msg.format(self.strides)) assert self.alloc_size >= 0, "Negative memory size" stream = self._default_stream(stream) @@ -275,16 +281,22 @@ def copy_to_host(self, ary=None, stream=0): hostary = ary if self.alloc_size != 0: - _driver.device_to_host(hostary, self, self.alloc_size, - stream=stream) + 
_driver.device_to_host( + hostary, self, self.alloc_size, stream=stream + ) if ary is None: if self.size == 0: - hostary = np.ndarray(shape=self.shape, dtype=self.dtype, - buffer=hostary) + hostary = np.ndarray( + shape=self.shape, dtype=self.dtype, buffer=hostary + ) else: - hostary = np.ndarray(shape=self.shape, dtype=self.dtype, - strides=self.strides, buffer=hostary) + hostary = np.ndarray( + shape=self.shape, + dtype=self.dtype, + strides=self.strides, + buffer=hostary, + ) return hostary def split(self, section, stream=0): @@ -305,12 +317,16 @@ def split(self, section, stream=0): end = min(begin + section, self.size) shape = (end - begin,) gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize) - yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream, - gpu_data=gpu_data) + yield DeviceNDArray( + shape, + strides, + dtype=self.dtype, + stream=stream, + gpu_data=gpu_data, + ) def as_cuda_arg(self): - """Returns a device memory object that is used as the argument. - """ + """Returns a device memory object that is used as the argument.""" return self.gpu_data def get_ipc_handle(self): @@ -368,8 +384,7 @@ def view(self, dtype): ) shape[-1], rem = divmod( - shape[-1] * self.dtype.itemsize, - dtype.itemsize + shape[-1] * self.dtype.itemsize, dtype.itemsize ) if rem != 0: @@ -398,14 +413,16 @@ def nbytes(self): class DeviceRecord(DeviceNDArrayBase): - ''' + """ An on-GPU record type - ''' + """ + def __init__(self, dtype, stream=0, gpu_data=None): shape = () strides = () - super(DeviceRecord, self).__init__(shape, strides, dtype, stream, - gpu_data) + super(DeviceRecord, self).__init__( + shape, strides, dtype, stream, gpu_data + ) @property def flags(self): @@ -415,7 +432,7 @@ def flags(self): with an existing `numpy.ndarray` (as the C- and F- contiguous flags aren't writeable). 
""" - return dict(self._dummy.flags) # defensive copy + return dict(self._dummy.flags) # defensive copy @property def _numba_type_(self): @@ -431,8 +448,7 @@ def __getitem__(self, item): @devices.require_context def getitem(self, item, stream=0): - """Do `__getitem__(item)` with CUDA stream - """ + """Do `__getitem__(item)` with CUDA stream""" return self._do_getitem(item, stream) def _do_getitem(self, item, stream=0): @@ -442,22 +458,24 @@ def _do_getitem(self, item, stream=0): if typ.shape == (): if typ.names is not None: - return DeviceRecord(dtype=typ, stream=stream, - gpu_data=newdata) + return DeviceRecord(dtype=typ, stream=stream, gpu_data=newdata) else: hostary = np.empty(1, dtype=typ) - _driver.device_to_host(dst=hostary, src=newdata, - size=typ.itemsize, - stream=stream) + _driver.device_to_host( + dst=hostary, src=newdata, size=typ.itemsize, stream=stream + ) return hostary[0] else: - shape, strides, dtype = \ - prepare_shape_strides_dtype(typ.shape, - None, - typ.subdtype[0], 'C') - return DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=newdata, - stream=stream) + shape, strides, dtype = prepare_shape_strides_dtype( + typ.shape, None, typ.subdtype[0], "C" + ) + return DeviceNDArray( + shape=shape, + strides=strides, + dtype=dtype, + gpu_data=newdata, + stream=stream, + ) @devices.require_context def __setitem__(self, key, value): @@ -465,12 +483,10 @@ def __setitem__(self, key, value): @devices.require_context def setitem(self, key, value, stream=0): - """Do `__setitem__(key, value)` with CUDA stream - """ + """Do `__setitem__(key, value)` with CUDA stream""" return self._do_setitem(key, value, stream=stream) def _do_setitem(self, key, value, stream=0): - stream = self._default_stream(stream) # If the record didn't have a default stream, and the user didn't @@ -515,6 +531,7 @@ def _assign_kernel(ndim): @cuda.jit def kernel(lhs, rhs): lhs[()] = rhs[()] + return kernel @cuda.jit @@ -531,9 +548,7 @@ def kernel(lhs, rhs): # [0, :] is the 
to-index (into `lhs`) # [1, :] is the from-index (into `rhs`) - idx = cuda.local.array( - shape=(2, ndim), - dtype=types.int64) + idx = cuda.local.array(shape=(2, ndim), dtype=types.int64) for i in range(ndim - 1, -1, -1): idx[0, i] = location % lhs.shape[i] @@ -541,17 +556,19 @@ def kernel(lhs, rhs): location //= lhs.shape[i] lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)] + return kernel class DeviceNDArray(DeviceNDArrayBase): - ''' + """ An on-GPU array type - ''' + """ + def is_f_contiguous(self): - ''' + """ Return true if the array is Fortran-contiguous. - ''' + """ return self._dummy.is_f_contig @property @@ -562,12 +579,12 @@ def flags(self): with an existing `numpy.ndarray` (as the C- and F- contiguous flags aren't writeable). """ - return dict(self._dummy.flags) # defensive copy + return dict(self._dummy.flags) # defensive copy def is_c_contiguous(self): - ''' + """ Return true if the array is C-contiguous. - ''' + """ return self._dummy.is_c_contig def __array__(self, dtype=None, copy=None): @@ -590,7 +607,7 @@ def reshape(self, *newshape, **kws): Reshape the array without changing its contents, similarly to :meth:`numpy.ndarray.reshape`. 
Example:: - d_arr = d_arr.reshape(20, 50, order='F') + d_arr = d_arr.reshape(20, 50, order="F") """ if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)): newshape = newshape[0] @@ -598,31 +615,43 @@ def reshape(self, *newshape, **kws): cls = type(self) if newshape == self.shape: # nothing to do - return cls(shape=self.shape, strides=self.strides, - dtype=self.dtype, gpu_data=self.gpu_data) + return cls( + shape=self.shape, + strides=self.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + ) newarr, extents = self._dummy.reshape(*newshape, **kws) if extents == [self._dummy.extent]: - return cls(shape=newarr.shape, strides=newarr.strides, - dtype=self.dtype, gpu_data=self.gpu_data) + return cls( + shape=newarr.shape, + strides=newarr.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + ) else: raise NotImplementedError("operation requires copying") - def ravel(self, order='C', stream=0): - ''' + def ravel(self, order="C", stream=0): + """ Flattens a contiguous array without changing its contents, similar to :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an exception. 
- ''' + """ stream = self._default_stream(stream) cls = type(self) newarr, extents = self._dummy.ravel(order=order) if extents == [self._dummy.extent]: - return cls(shape=newarr.shape, strides=newarr.strides, - dtype=self.dtype, gpu_data=self.gpu_data, - stream=stream) + return cls( + shape=newarr.shape, + strides=newarr.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + stream=stream, + ) else: raise NotImplementedError("operation requires copying") @@ -633,8 +662,7 @@ def __getitem__(self, item): @devices.require_context def getitem(self, item, stream=0): - """Do `__getitem__(item)` with CUDA stream - """ + """Do `__getitem__(item)` with CUDA stream""" return self._do_getitem(item, stream) def _do_getitem(self, item, stream=0): @@ -649,22 +677,36 @@ def _do_getitem(self, item, stream=0): if not arr.is_array: # Check for structured array type (record) if self.dtype.names is not None: - return DeviceRecord(dtype=self.dtype, stream=stream, - gpu_data=newdata) + return DeviceRecord( + dtype=self.dtype, stream=stream, gpu_data=newdata + ) else: # Element indexing hostary = np.empty(1, dtype=self.dtype) - _driver.device_to_host(dst=hostary, src=newdata, - size=self._dummy.itemsize, - stream=stream) + _driver.device_to_host( + dst=hostary, + src=newdata, + size=self._dummy.itemsize, + stream=stream, + ) return hostary[0] else: - return cls(shape=arr.shape, strides=arr.strides, - dtype=self.dtype, gpu_data=newdata, stream=stream) + return cls( + shape=arr.shape, + strides=arr.strides, + dtype=self.dtype, + gpu_data=newdata, + stream=stream, + ) else: newdata = self.gpu_data.view(*arr.extent) - return cls(shape=arr.shape, strides=arr.strides, - dtype=self.dtype, gpu_data=newdata, stream=stream) + return cls( + shape=arr.shape, + strides=arr.strides, + dtype=self.dtype, + gpu_data=newdata, + stream=stream, + ) @devices.require_context def __setitem__(self, key, value): @@ -672,12 +714,10 @@ def __setitem__(self, key, value): @devices.require_context def setitem(self, 
key, value, stream=0): - """Do `__setitem__(key, value)` with CUDA stream - """ + """Do `__setitem__(key, value)` with CUDA stream""" return self._do_setitem(key, value, stream=stream) def _do_setitem(self, key, value, stream=0): - stream = self._default_stream(stream) # If the array didn't have a default stream, and the user didn't provide @@ -706,23 +746,26 @@ def _do_setitem(self, key, value, stream=0): strides=strides, dtype=self.dtype, gpu_data=newdata, - stream=stream) + stream=stream, + ) # (2) prepare RHS rhs, _ = auto_device(value, stream=stream, user_explicit=True) if rhs.ndim > lhs.ndim: - raise ValueError("Can't assign %s-D array to %s-D self" % ( - rhs.ndim, - lhs.ndim)) + raise ValueError( + "Can't assign %s-D array to %s-D self" % (rhs.ndim, lhs.ndim) + ) rhs_shape = np.ones(lhs.ndim, dtype=np.int64) # negative indices would not work if rhs.ndim == 0 - rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape + rhs_shape[lhs.ndim - rhs.ndim :] = rhs.shape rhs = rhs.reshape(*rhs_shape) for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)): if r != 1 and l != r: - raise ValueError("Can't copy sequence with size %d to array " - "axis %d with dimension %d" % ( r, i, l)) + raise ValueError( + "Can't copy sequence with size %d to array " + "axis %d with dimension %d" % (r, i, l) + ) # (3) do the copy @@ -751,6 +794,7 @@ class IpcArrayHandle(object): some_code(ipc_array) # ipc_array is dead at this point """ + def __init__(self, ipc_handle, array_desc): self._array_desc = array_desc self._ipc_handle = ipc_handle @@ -798,8 +842,9 @@ def device_setup(self, gpu_data, stream=0): def from_array_like(ary, stream=0, gpu_data=None): "Create a DeviceNDArray object that is like ary." 
- return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream, - gpu_data=gpu_data) + return DeviceNDArray( + ary.shape, ary.strides, ary.dtype, stream=stream, gpu_data=gpu_data + ) def from_record_like(rec, stream=0, gpu_data=None): @@ -841,15 +886,17 @@ def is_contiguous(ary): return True -errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot " - "be transferred as a single memory region. Please " - "ensure contiguous buffer with numpy " - ".ascontiguousarray()") +errmsg_contiguous_buffer = ( + "Array contains non-contiguous buffer and cannot " + "be transferred as a single memory region. Please " + "ensure contiguous buffer with numpy " + ".ascontiguousarray()" +) def sentry_contiguous(ary): core = array_core(ary) - if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']: + if not core.flags["C_CONTIGUOUS"] and not core.flags["F_CONTIGUOUS"]: raise ValueError(errmsg_contiguous_buffer) @@ -861,7 +908,7 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): """ if _driver.is_device_memory(obj): return obj, False - elif hasattr(obj, '__cuda_array_interface__'): + elif hasattr(obj, "__cuda_array_interface__"): return numba.cuda.as_cuda_array(obj), False else: if isinstance(obj, np.void): @@ -873,9 +920,8 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): # into this function (with no overhead -- copies -- for `obj`s # that are already `ndarray`s. 
obj = np.array( - obj, - copy=False if numpy_version < (2, 0) else None, - subok=True) + obj, copy=False if numpy_version < (2, 0) else None, subok=True + ) sentry_contiguous(obj) devobj = from_array_like(obj, stream=stream) if copy: @@ -883,13 +929,14 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): config.CUDA_WARN_ON_IMPLICIT_COPY and not config.DISABLE_PERFORMANCE_WARNINGS ): - if ( - not user_explicit and - (not isinstance(obj, DeviceNDArray) - and isinstance(obj, np.ndarray)) + if not user_explicit and ( + not isinstance(obj, DeviceNDArray) + and isinstance(obj, np.ndarray) ): - msg = ("Host array used in CUDA kernel will incur " - "copy overhead to/from device.") + msg = ( + "Host array used in CUDA kernel will incur " + "copy overhead to/from device." + ) warn(NumbaPerformanceWarning(msg)) devobj.copy_to_device(obj, stream=stream) return devobj, True @@ -898,13 +945,16 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): def check_array_compatibility(ary1, ary2): ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze() if ary1.dtype != ary2.dtype: - raise TypeError('incompatible dtype: %s vs. %s' % - (ary1.dtype, ary2.dtype)) + raise TypeError( + "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype) + ) if ary1sq.shape != ary2sq.shape: - raise ValueError('incompatible shape: %s vs. %s' % - (ary1.shape, ary2.shape)) + raise ValueError( + "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape) + ) # We check strides only if the size is nonzero, because strides are # irrelevant (and can differ) for zero-length copies. if ary1.size and ary1sq.strides != ary2sq.strides: - raise ValueError('incompatible strides: %s vs. %s' % - (ary1.strides, ary2.strides)) + raise ValueError( + "incompatible strides: %s vs. 
%s" % (ary1.strides, ary2.strides) + ) diff --git a/numba_cuda/numba/cuda/cudadrv/devices.py b/numba_cuda/numba/cuda/cudadrv/devices.py index 6cc9e2e39..a570f91dd 100644 --- a/numba_cuda/numba/cuda/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/cudadrv/devices.py @@ -10,6 +10,7 @@ - This module must be imported by the main-thread. """ + import functools import threading from contextlib import contextmanager @@ -24,8 +25,10 @@ def __getattr__(self, attr): # Device list is not initialized. # Query all CUDA devices. numdev = driver.get_device_count() - gpus = [_DeviceContextManager(driver.get_device(devid)) - for devid in range(numdev)] + gpus = [ + _DeviceContextManager(driver.get_device(devid)) + for devid in range(numdev) + ] # Define "lst" to avoid re-initialization self.lst = gpus return gpus @@ -34,13 +37,13 @@ def __getattr__(self, attr): return super(_DeviceList, self).__getattr__(attr) def __getitem__(self, devnum): - ''' + """ Returns the context manager for device *devnum*. - ''' + """ return self.lst[devnum] def __str__(self): - return ', '.join([str(d) for d in self.lst]) + return ", ".join([str(d) for d in self.lst]) def __iter__(self): return iter(self.lst) @@ -50,8 +53,7 @@ def __len__(self): @property def current(self): - """Returns the active device or None if there's no active device - """ + """Returns the active device or None if there's no active device""" with driver.get_active_context() as ac: devnum = ac.devnum if devnum is not None: @@ -164,8 +166,10 @@ def _get_or_create_context_uncached(self, devnum): ctx_handle = ctx.handle.value ac_ctx_handle = ac.context_handle.value if ctx_handle != ac_ctx_handle: - msg = ('Numba cannot operate on non-primary' - ' CUDA context {:x}') + msg = ( + "Numba cannot operate on non-primary" + " CUDA context {:x}" + ) raise RuntimeError(msg.format(ac_ctx_handle)) # Ensure the context is ready ctx.prepare_for_use() @@ -178,12 +182,12 @@ def _activate_context_for(self, devnum): # Detect unexpected context switch 
cached_ctx = self._get_attached_context() if cached_ctx is not None and cached_ctx is not newctx: - raise RuntimeError('Cannot switch CUDA-context.') + raise RuntimeError("Cannot switch CUDA-context.") newctx.push() return newctx def _get_attached_context(self): - return getattr(self._tls, 'attached_context', None) + return getattr(self._tls, "attached_context", None) def _set_attached_context(self, ctx): self._tls.attached_context = ctx @@ -226,6 +230,7 @@ def require_context(fn): Note: The function *fn* cannot switch CUDA-context. """ + @functools.wraps(fn) def _require_cuda_context(*args, **kws): with _runtime.ensure_context(): diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 1641bf779..8db11880b 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -10,6 +10,7 @@ system to freeze in some cases. """ + import sys import os import ctypes @@ -25,8 +26,17 @@ import re from itertools import product from abc import ABCMeta, abstractmethod -from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof, - c_void_p, c_float, c_uint) +from ctypes import ( + c_int, + byref, + c_size_t, + c_char, + c_char_p, + addressof, + c_void_p, + c_float, + c_uint, +) import contextlib import importlib import numpy as np @@ -51,13 +61,14 @@ if USE_NV_BINDING: from cuda import cuda as binding + # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to # use a magic number 0 in places where we want the default stream. 
CU_STREAM_DEFAULT = 0 MIN_REQUIRED_CC = (3, 5) -SUPPORTS_IPC = sys.platform.startswith('linux') +SUPPORTS_IPC = sys.platform.startswith("linux") _py_decref = ctypes.pythonapi.Py_DecRef @@ -71,10 +82,9 @@ "to be available" ) -ENABLE_PYNVJITLINK = ( - _readenv("NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False) - or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False) -) +ENABLE_PYNVJITLINK = _readenv( + "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False +) or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False) if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"): config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK @@ -94,7 +104,7 @@ def make_logger(): if config.CUDA_LOG_LEVEL: # create a simple handler that prints to stderr handler = logging.StreamHandler(sys.stderr) - fmt = '== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s' + fmt = "== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s" handler.setFormatter(logging.Formatter(fmt=fmt)) logger.addHandler(handler) else: @@ -122,50 +132,52 @@ def __str__(self): def locate_driver_and_loader(): - envpath = config.CUDA_DRIVER - if envpath == '0': + if envpath == "0": # Force fail _raise_driver_not_found() # Determine DLL type - if sys.platform == 'win32': + if sys.platform == "win32": dlloader = ctypes.WinDLL - dldir = ['\\windows\\system32'] - dlnames = ['nvcuda.dll'] - elif sys.platform == 'darwin': + dldir = ["\\windows\\system32"] + dlnames = ["nvcuda.dll"] + elif sys.platform == "darwin": dlloader = ctypes.CDLL - dldir = ['/usr/local/cuda/lib'] - dlnames = ['libcuda.dylib'] + dldir = ["/usr/local/cuda/lib"] + dlnames = ["libcuda.dylib"] else: # Assume to be *nix like dlloader = ctypes.CDLL - dldir = ['/usr/lib', '/usr/lib64'] - dlnames = ['libcuda.so', 'libcuda.so.1'] + dldir = ["/usr/lib", "/usr/lib64"] + dlnames = ["libcuda.so", "libcuda.so.1"] if envpath: try: envpath = os.path.abspath(envpath) except ValueError: - raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid path" % - envpath) + raise ValueError( + 
"NUMBA_CUDA_DRIVER %s is not a valid path" % envpath + ) if not os.path.isfile(envpath): - raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid file " - "path. Note it must be a filepath of the .so/" - ".dll/.dylib or the driver" % envpath) + raise ValueError( + "NUMBA_CUDA_DRIVER %s is not a valid file " + "path. Note it must be a filepath of the .so/" + ".dll/.dylib or the driver" % envpath + ) candidates = [envpath] else: # First search for the name in the default library path. # If that is not found, try the specific path. - candidates = dlnames + [os.path.join(x, y) - for x, y in product(dldir, dlnames)] + candidates = dlnames + [ + os.path.join(x, y) for x, y in product(dldir, dlnames) + ] return dlloader, candidates def load_driver(dlloader, candidates): - # Load the driver; Collect driver error information path_not_exist = [] driver_load_error = [] @@ -184,7 +196,7 @@ def load_driver(dlloader, candidates): if all(path_not_exist): _raise_driver_not_found() else: - errmsg = '\n'.join(str(e) for e in driver_load_error) + errmsg = "\n".join(str(e) for e in driver_load_error) _raise_driver_error(errmsg) @@ -216,7 +228,7 @@ def _raise_driver_error(e): def _build_reverse_error_map(): - prefix = 'CUDA_ERROR' + prefix = "CUDA_ERROR" map = utils.UniqueDict() for name in dir(enums): if name.startswith(prefix): @@ -236,6 +248,7 @@ class Driver(object): """ Driver API functions are lazily bound. """ + _singleton = None def __new__(cls): @@ -254,9 +267,11 @@ def __init__(self): self.pid = None try: if config.DISABLE_CUDA: - msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " - "in the environment, or because CUDA is unsupported on " - "32-bit systems.") + msg = ( + "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " + "in the environment, or because CUDA is unsupported on " + "32-bit systems." 
+ ) raise CudaSupportError(msg) self.lib = find_driver() except CudaSupportError as e: @@ -273,7 +288,7 @@ def ensure_initialized(self): self.is_initialized = True try: - _logger.info('init') + _logger.info("init") self.cuInit(0) except CudaAPIError as e: description = f"{e.msg} ({e.code})" @@ -292,8 +307,9 @@ def __getattr__(self, fname): self.ensure_initialized() if self.initialization_error is not None: - raise CudaSupportError("Error at driver init: \n%s:" % - self.initialization_error) + raise CudaSupportError( + "Error at driver init: \n%s:" % self.initialization_error + ) if USE_NV_BINDING: return self._cuda_python_wrap_fn(fname) @@ -317,12 +333,12 @@ def _ctypes_wrap_fn(self, fname, libfn=None): def verbose_cuda_api_call(*args): argstr = ", ".join([str(arg) for arg in args]) - _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr) + _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr) retcode = libfn(*args) self._check_ctypes_error(fname, retcode) def safe_cuda_api_call(*args): - _logger.debug('call driver api: %s', libfn.__name__) + _logger.debug("call driver api: %s", libfn.__name__) retcode = libfn(*args) self._check_ctypes_error(fname, retcode) @@ -340,11 +356,11 @@ def _cuda_python_wrap_fn(self, fname): def verbose_cuda_api_call(*args): argstr = ", ".join([str(arg) for arg in args]) - _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr) + _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr) return self._check_cuda_python_error(fname, libfn(*args)) def safe_cuda_api_call(*args): - _logger.debug('call driver api: %s', libfn.__name__) + _logger.debug("call driver api: %s", libfn.__name__) return self._check_cuda_python_error(fname, libfn(*args)) if config.CUDA_LOG_API_ARGS: @@ -361,27 +377,27 @@ def _find_api(self, fname): # binding. For the NVidia binding, it handles linking to the correct # variant. 
if config.CUDA_PER_THREAD_DEFAULT_STREAM and not USE_NV_BINDING: - variants = ('_v2_ptds', '_v2_ptsz', '_ptds', '_ptsz', '_v2', '') + variants = ("_v2_ptds", "_v2_ptsz", "_ptds", "_ptsz", "_v2", "") else: - variants = ('_v2', '') + variants = ("_v2", "") for variant in variants: try: - return getattr(self.lib, f'{fname}{variant}') + return getattr(self.lib, f"{fname}{variant}") except AttributeError: pass # Not found. # Delay missing function error to use def absent_function(*args, **kws): - raise CudaDriverError(f'Driver missing function: {fname}') + raise CudaDriverError(f"Driver missing function: {fname}") setattr(self, fname, absent_function) return absent_function def _detect_fork(self): if self.pid is not None and _getpid() != self.pid: - msg = 'pid %s forked from pid %s after CUDA driver init' + msg = "pid %s forked from pid %s after CUDA driver init" _logger.critical(msg, _getpid(), self.pid) raise CudaDriverError("CUDA initialized before forking") @@ -425,13 +441,11 @@ def get_device_count(self): return count.value def list_devices(self): - """Returns a list of active devices - """ + """Returns a list of active devices""" return list(self.devices.values()) def reset(self): - """Reset all devices - """ + """Reset all devices""" for dev in self.devices.values(): dev.reset() @@ -449,8 +463,7 @@ def pop_active_context(self): return popped def get_active_context(self): - """Returns an instance of ``_ActiveContext``. - """ + """Returns an instance of ``_ActiveContext``.""" return _ActiveContext() def get_version(self): @@ -477,12 +490,13 @@ class _ActiveContext(object): Once entering the context, it is assumed that the active CUDA context is not changed until the context is exited. """ + _tls_cache = threading.local() def __enter__(self): is_top = False # check TLS cache - if hasattr(self._tls_cache, 'ctx_devnum'): + if hasattr(self._tls_cache, "ctx_devnum"): hctx, devnum = self._tls_cache.ctx_devnum # Not cached. Query the driver API. 
else: @@ -515,11 +529,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): if self._is_top: - delattr(self._tls_cache, 'ctx_devnum') + delattr(self._tls_cache, "ctx_devnum") def __bool__(self): - """Returns True is there's a valid and active CUDA context. - """ + """Returns True is there's a valid and active CUDA context.""" return self.context_handle is not None __nonzero__ = __bool__ @@ -533,7 +546,7 @@ def _build_reverse_device_attrs(): map = utils.UniqueDict() for name in dir(enums): if name.startswith(prefix): - map[name[len(prefix):]] = getattr(enums, name) + map[name[len(prefix) :]] = getattr(enums, name) return map @@ -545,6 +558,7 @@ class Device(object): The device object owns the CUDA contexts. This is owned by the driver object. User should not construct devices directly. """ + @classmethod def from_identity(self, identity): """Create Device object from device identity created by @@ -579,15 +593,17 @@ def __init__(self, devnum): self.attributes = {} # Read compute capability - self.compute_capability = (self.COMPUTE_CAPABILITY_MAJOR, - self.COMPUTE_CAPABILITY_MINOR) + self.compute_capability = ( + self.COMPUTE_CAPABILITY_MAJOR, + self.COMPUTE_CAPABILITY_MINOR, + ) # Read name bufsz = 128 if USE_NV_BINDING: buf = driver.cuDeviceGetName(bufsz, self.id) - name = buf.decode('utf-8').rstrip('\0') + name = buf.decode("utf-8").rstrip("\0") else: buf = (c_char * bufsz)() driver.cuDeviceGetName(buf, bufsz, self.id) @@ -604,31 +620,31 @@ def __init__(self, devnum): driver.cuDeviceGetUuid(byref(uuid), self.id) uuid_vals = tuple(bytes(uuid)) - b = '%02x' + b = "%02x" b2 = b * 2 b4 = b * 4 b6 = b * 6 - fmt = f'GPU-{b4}-{b2}-{b2}-{b2}-{b6}' + fmt = f"GPU-{b4}-{b2}-{b2}-{b2}-{b6}" self.uuid = fmt % uuid_vals self.primary_context = None def get_device_identity(self): return { - 'pci_domain_id': self.PCI_DOMAIN_ID, - 'pci_bus_id': self.PCI_BUS_ID, - 'pci_device_id': self.PCI_DEVICE_ID, + "pci_domain_id": self.PCI_DOMAIN_ID, + "pci_bus_id": 
self.PCI_BUS_ID, + "pci_device_id": self.PCI_DEVICE_ID, } def __repr__(self): return "" % (self.id, self.name) def __getattr__(self, attr): - """Read attributes lazily - """ + """Read attributes lazily""" if USE_NV_BINDING: - code = getattr(binding.CUdevice_attribute, - f'CU_DEVICE_ATTRIBUTE_{attr}') + code = getattr( + binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}" + ) value = driver.cuDeviceGetAttribute(code, self.id) else: try: @@ -698,17 +714,18 @@ def supports_float16(self): def met_requirement_for_device(device): if device.compute_capability < MIN_REQUIRED_CC: - raise CudaSupportError("%s has compute capability < %s" % - (device, MIN_REQUIRED_CC)) + raise CudaSupportError( + "%s has compute capability < %s" % (device, MIN_REQUIRED_CC) + ) class BaseCUDAMemoryManager(object, metaclass=ABCMeta): """Abstract base class for External Memory Management (EMM) Plugins.""" def __init__(self, *args, **kwargs): - if 'context' not in kwargs: + if "context" not in kwargs: raise RuntimeError("Memory manager requires a context") - self.context = kwargs.pop('context') + self.context = kwargs.pop("context") @abstractmethod def memalloc(self, size): @@ -864,8 +881,7 @@ def _attempt_allocation(self, allocator): else: raise - def memhostalloc(self, size, mapped=False, portable=False, - wc=False): + def memhostalloc(self, size, mapped=False, portable=False, wc=False): """Implements the allocation of pinned host memory. 
It is recommended that this method is not overridden by EMM Plugin @@ -880,6 +896,7 @@ def memhostalloc(self, size, mapped=False, portable=False, flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED if USE_NV_BINDING: + def allocator(): return driver.cuMemHostAlloc(size, flags) @@ -946,16 +963,19 @@ def allocator(): ctx = weakref.proxy(self.context) if mapped: - mem = MappedMemory(ctx, pointer, size, owner=owner, - finalizer=finalizer) + mem = MappedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) self.allocations[alloc_key] = mem return mem.own() else: - return PinnedMemory(ctx, pointer, size, owner=owner, - finalizer=finalizer) + return PinnedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) def memallocmanaged(self, size, attach_global): if USE_NV_BINDING: + def allocator(): ma_flags = binding.CUmemAttach_flags @@ -1014,8 +1034,7 @@ def defer_cleanup(self): class GetIpcHandleMixin: - """A class that provides a default implementation of ``get_ipc_handle()``. 
- """ + """A class that provides a default implementation of ``get_ipc_handle()``.""" def get_ipc_handle(self, memory): """Open an IPC memory handle by using ``cuMemGetAddressRange`` to @@ -1034,8 +1053,9 @@ def get_ipc_handle(self, memory): offset = memory.handle.value - base source_info = self.context.device.get_device_identity() - return IpcHandle(memory, ipchandle, memory.size, source_info, - offset=offset) + return IpcHandle( + memory, ipchandle, memory.size, source_info, offset=offset + ) class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager): @@ -1050,6 +1070,7 @@ def initialize(self): def memalloc(self, size): if USE_NV_BINDING: + def allocator(): return driver.cuMemAlloc(size) @@ -1098,7 +1119,7 @@ def _ensure_memory_manager(): if _memory_manager: return - if config.CUDA_MEMORY_MANAGER == 'default': + if config.CUDA_MEMORY_MANAGER == "default": _memory_manager = NumbaCUDAMemoryManager return @@ -1106,8 +1127,9 @@ def _ensure_memory_manager(): mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER) set_memory_manager(mgr_module._numba_memory_manager) except Exception: - raise RuntimeError("Failed to use memory manager from %s" % - config.CUDA_MEMORY_MANAGER) + raise RuntimeError( + "Failed to use memory manager from %s" % config.CUDA_MEMORY_MANAGER + ) def set_memory_manager(mm_plugin): @@ -1124,8 +1146,10 @@ def set_memory_manager(mm_plugin): dummy = mm_plugin(context=None) iv = dummy.interface_version if iv != _SUPPORTED_EMM_INTERFACE_VERSION: - err = "EMM Plugin interface has version %d - version %d required" \ - % (iv, _SUPPORTED_EMM_INTERFACE_VERSION) + err = "EMM Plugin interface has version %d - version %d required" % ( + iv, + _SUPPORTED_EMM_INTERFACE_VERSION, + ) raise RuntimeError(err) _memory_manager = mm_plugin @@ -1140,7 +1164,7 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls, 0) def __str__(self): - return '?' + return "?" 
_SizeNotSet = _SizeNotSet() @@ -1153,6 +1177,7 @@ class _PendingDeallocs(object): modified later once the driver is initialized and the total memory capacity known. """ + def __init__(self, capacity=_SizeNotSet): self._cons = deque() self._disable_count = 0 @@ -1172,11 +1197,13 @@ def add_item(self, dtor, handle, size=_SizeNotSet): byte size of the resource added. It is an optional argument. Some resources (e.g. CUModule) has an unknown memory footprint on the device. """ - _logger.info('add pending dealloc: %s %s bytes', dtor.__name__, size) + _logger.info("add pending dealloc: %s %s bytes", dtor.__name__, size) self._cons.append((dtor, handle, size)) self._size += int(size) - if (len(self._cons) > config.CUDA_DEALLOCS_COUNT or - self._size > self._max_pending_bytes): + if ( + len(self._cons) > config.CUDA_DEALLOCS_COUNT + or self._size > self._max_pending_bytes + ): self.clear() def clear(self): @@ -1187,7 +1214,7 @@ def clear(self): if not self.is_disabled: while self._cons: [dtor, handle, size] = self._cons.popleft() - _logger.info('dealloc: %s %s bytes', dtor.__name__, size) + _logger.info("dealloc: %s %s bytes", dtor.__name__, size) dtor(handle) self._size = 0 @@ -1251,19 +1278,19 @@ def reset(self): Clean up all owned resources in this context. """ # Free owned resources - _logger.info('reset context of device %s', self.device.id) + _logger.info("reset context of device %s", self.device.id) self.memory_manager.reset() self.modules.clear() # Clear trash self.deallocations.clear() def get_memory_info(self): - """Returns (free, total) memory in bytes in the context. - """ + """Returns (free, total) memory in bytes in the context.""" return self.memory_manager.get_memory_info() - def get_active_blocks_per_multiprocessor(self, func, blocksize, memsize, - flags=None): + def get_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags=None + ): """Return occupancy of a function. 
:param func: kernel for which occupancy is calculated :param blocksize: block size the kernel is intended to be launched with @@ -1275,8 +1302,9 @@ def get_active_blocks_per_multiprocessor(self, func, blocksize, memsize, else: return self._ctypes_active_blocks_per_multiprocessor(*args) - def _cuda_python_active_blocks_per_multiprocessor(self, func, blocksize, - memsize, flags): + def _cuda_python_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags + ): ps = [func.handle, blocksize, memsize] if not flags: @@ -1285,8 +1313,9 @@ def _cuda_python_active_blocks_per_multiprocessor(self, func, blocksize, ps.append(flags) return driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*ps) - def _ctypes_active_blocks_per_multiprocessor(self, func, blocksize, - memsize, flags): + def _ctypes_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags + ): retval = c_int() args = (byref(retval), func.handle, blocksize, memsize) @@ -1297,8 +1326,9 @@ def _ctypes_active_blocks_per_multiprocessor(self, func, blocksize, return retval.value - def get_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags=None): + def get_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags=None + ): """Suggest a launch configuration with reasonable occupancy. 
:param func: kernel for which occupancy is calculated :param b2d_func: function that calculates how much per-block dynamic @@ -1315,13 +1345,20 @@ def get_max_potential_block_size(self, func, b2d_func, memsize, else: return self._ctypes_max_potential_block_size(*args) - def _ctypes_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags): + def _ctypes_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags + ): gridsize = c_int() blocksize = c_int() b2d_cb = cu_occupancy_b2d_size(b2d_func) - args = [byref(gridsize), byref(blocksize), func.handle, b2d_cb, - memsize, blocksizelimit] + args = [ + byref(gridsize), + byref(blocksize), + func.handle, + b2d_cb, + memsize, + blocksizelimit, + ] if not flags: driver.cuOccupancyMaxPotentialBlockSize(*args) @@ -1331,10 +1368,11 @@ def _ctypes_max_potential_block_size(self, func, b2d_func, memsize, return (gridsize.value, blocksize.value) - def _cuda_python_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags): + def _cuda_python_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags + ): b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func) - ptr = int.from_bytes(b2d_cb, byteorder='little') + ptr = int.from_bytes(b2d_cb, byteorder="little") driver_b2d_cb = binding.CUoccupancyB2DSize(ptr) args = [func.handle, driver_b2d_cb, memsize, blocksizelimit] @@ -1387,7 +1425,7 @@ def get_ipc_handle(self, memory): Returns an *IpcHandle* from a GPU allocation. 
""" if not SUPPORTS_IPC: - raise OSError('OS does not support CUDA IPC') + raise OSError("OS does not support CUDA IPC") return self.memory_manager.get_ipc_handle(memory) def open_ipc_handle(self, handle, size): @@ -1400,13 +1438,13 @@ def open_ipc_handle(self, handle, size): driver.cuIpcOpenMemHandle(byref(dptr), handle, flags) # wrap it - return MemoryPointer(context=weakref.proxy(self), pointer=dptr, - size=size) + return MemoryPointer( + context=weakref.proxy(self), pointer=dptr, size=size + ) def enable_peer_access(self, peer_context, flags=0): - """Enable peer access between the current context and the peer context - """ - assert flags == 0, '*flags* is reserved and MUST be zero' + """Enable peer access between the current context and the peer context""" + assert flags == 0, "*flags* is reserved and MUST be zero" driver.cuCtxEnablePeerAccess(peer_context, flags) def can_access_peer(self, peer_device): @@ -1415,18 +1453,22 @@ def can_access_peer(self, peer_device): """ if USE_NV_BINDING: peer_device = binding.CUdevice(peer_device) - can_access_peer = driver.cuDeviceCanAccessPeer(self.device.id, - peer_device) + can_access_peer = driver.cuDeviceCanAccessPeer( + self.device.id, peer_device + ) else: can_access_peer = c_int() - driver.cuDeviceCanAccessPeer(byref(can_access_peer), - self.device.id, peer_device,) + driver.cuDeviceCanAccessPeer( + byref(can_access_peer), + self.device.id, + peer_device, + ) return bool(can_access_peer) def create_module_ptx(self, ptx): if isinstance(ptx, str): - ptx = ptx.encode('utf8') + ptx = ptx.encode("utf8") if USE_NV_BINDING: image = ptx else: @@ -1481,8 +1523,11 @@ def create_stream(self): else: handle = drvapi.cu_stream() driver.cuStreamCreate(byref(handle), 0) - return Stream(weakref.proxy(self), handle, - _stream_finalizer(self.deallocations, handle)) + return Stream( + weakref.proxy(self), + handle, + _stream_finalizer(self.deallocations, handle), + ) def create_external_stream(self, ptr): if not isinstance(ptr, int): @@ 
-1491,8 +1536,7 @@ def create_external_stream(self, ptr): handle = binding.CUstream(ptr) else: handle = drvapi.cu_stream(ptr) - return Stream(weakref.proxy(self), handle, None, - external=True) + return Stream(weakref.proxy(self), handle, None, external=True) def create_event(self, timing=True): flags = 0 @@ -1503,8 +1547,11 @@ def create_event(self, timing=True): else: handle = drvapi.cu_event() driver.cuEventCreate(byref(handle), flags) - return Event(weakref.proxy(self), handle, - finalizer=_event_finalizer(self.deallocations, handle)) + return Event( + weakref.proxy(self), + handle, + finalizer=_event_finalizer(self.deallocations, handle), + ) def synchronize(self): driver.cuCtxSynchronize() @@ -1557,16 +1604,21 @@ def load_module_image_ctypes(context, image): handle = drvapi.cu_module() try: - driver.cuModuleLoadDataEx(byref(handle), image, len(options), - option_keys, option_vals) + driver.cuModuleLoadDataEx( + byref(handle), image, len(options), option_keys, option_vals + ) except CudaAPIError as e: msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8") raise CudaAPIError(e.code, msg) info_log = jitinfo.value - return CtypesModule(weakref.proxy(context), handle, info_log, - _module_finalizer(context, handle)) + return CtypesModule( + weakref.proxy(context), + handle, + info_log, + _module_finalizer(context, handle), + ) def load_module_image_cuda_python(context, image): @@ -1591,17 +1643,22 @@ def load_module_image_cuda_python(context, image): option_vals = [v for v in options.values()] try: - handle = driver.cuModuleLoadDataEx(image, len(options), option_keys, - option_vals) + handle = driver.cuModuleLoadDataEx( + image, len(options), option_keys, option_vals + ) except CudaAPIError as e: - err_string = jiterrors.decode('utf-8') + err_string = jiterrors.decode("utf-8") msg = "cuModuleLoadDataEx error:\n%s" % err_string raise CudaAPIError(e.code, msg) - info_log = jitinfo.decode('utf-8') + info_log = jitinfo.decode("utf-8") - return 
CudaPythonModule(weakref.proxy(context), handle, info_log, - _module_finalizer(context, handle)) + return CudaPythonModule( + weakref.proxy(context), + handle, + info_log, + _module_finalizer(context, handle), + ) def _alloc_finalizer(memory_manager, ptr, alloc_key, size): @@ -1704,6 +1761,7 @@ class _CudaIpcImpl(object): """Implementation of GPU IPC using CUDA driver API. This requires the devices to be peer accessible. """ + def __init__(self, parent): self.base = parent.base self.handle = parent.handle @@ -1717,10 +1775,10 @@ def open(self, context): Import the IPC memory and returns a raw CUDA memory pointer object """ if self.base is not None: - raise ValueError('opening IpcHandle from original process') + raise ValueError("opening IpcHandle from original process") if self._opened_mem is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") mem = context.open_ipc_handle(self.handle, self.offset + self.size) # this object owns the opened allocation @@ -1731,7 +1789,7 @@ def open(self, context): def close(self): if self._opened_mem is None: - raise ValueError('IpcHandle not opened') + raise ValueError("IpcHandle not opened") driver.cuIpcCloseMemHandle(self._opened_mem.handle) self._opened_mem = None @@ -1740,6 +1798,7 @@ class _StagedIpcImpl(object): """Implementation of GPU IPC using custom staging logic to workaround CUDA IPC limitation on peer accessibility between devices. """ + def __init__(self, parent, source_info): self.parent = parent self.base = parent.base @@ -1795,6 +1854,7 @@ class IpcHandle(object): referred to by this IPC handle. :type offset: int """ + def __init__(self, base, handle, size, source_info=None, offset=0): self.base = base self.handle = handle @@ -1818,12 +1878,11 @@ def can_access_peer(self, context): return context.can_access_peer(source_device.id) def open_staged(self, context): - """Open the IPC by allowing staging on the host memory first. 
- """ + """Open the IPC by allowing staging on the host memory first.""" self._sentry_source_info() if self._impl is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") self._impl = _StagedIpcImpl(self, self.source_info) return self._impl.open(context) @@ -1833,7 +1892,7 @@ def open_direct(self, context): Import the IPC memory and returns a raw CUDA memory pointer object """ if self._impl is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") self._impl = _CudaIpcImpl(self) return self._impl.open(context) @@ -1864,12 +1923,13 @@ def open_array(self, context, shape, dtype, strides=None): strides = dtype.itemsize dptr = self.open(context) # read the device pointer as an array - return devicearray.DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=dptr) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, gpu_data=dptr + ) def close(self): if self._impl is None: - raise ValueError('IpcHandle not opened') + raise ValueError("IpcHandle not opened") self._impl.close() self._impl = None @@ -1895,8 +1955,13 @@ def _rebuild(cls, handle_ary, size, source_info, offset): else: handle = drvapi.cu_ipc_mem_handle() handle.reserved = handle_ary - return cls(base=None, handle=handle, size=size, - source_info=source_info, offset=offset) + return cls( + base=None, + handle=handle, + size=size, + source_info=source_info, + offset=offset, + ) class MemoryPointer(object): @@ -1930,6 +1995,7 @@ class MemoryPointer(object): :param finalizer: A function that is called when the buffer is to be freed. 
:type finalizer: function """ + __cuda_memory__ = True def __init__(self, context, pointer, size, owner=None, finalizer=None): @@ -1965,8 +2031,9 @@ def free(self): def memset(self, byte, count=None, stream=0): count = self.size if count is None else count if stream: - driver.cuMemsetD8Async(self.device_pointer, byte, count, - stream.handle) + driver.cuMemsetD8Async( + self.device_pointer, byte, count, stream.handle + ) else: driver.cuMemsetD8(self.device_pointer, byte, count) @@ -1980,12 +2047,12 @@ def view(self, start, stop=None): if not self.device_pointer_value: if size != 0: raise RuntimeError("non-empty slice into empty slice") - view = self # new view is just a reference to self + view = self # new view is just a reference to self # Handle normal case else: base = self.device_pointer_value + start if size < 0: - raise RuntimeError('size cannot be negative') + raise RuntimeError("size cannot be negative") if USE_NV_BINDING: pointer = binding.CUdeviceptr() ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr()) @@ -2021,6 +2088,7 @@ class AutoFreePointer(MemoryPointer): Constructor arguments are the same as for :class:`MemoryPointer`. 
""" + def __init__(self, *args, **kwargs): super(AutoFreePointer, self).__init__(*args, **kwargs) # Releease the self reference to the buffer, so that the finalizer @@ -2063,8 +2131,9 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None): self._bufptr_ = self.host_pointer.value self.device_pointer = devptr - super(MappedMemory, self).__init__(context, devptr, size, - finalizer=finalizer) + super(MappedMemory, self).__init__( + context, devptr, size, finalizer=finalizer + ) self.handle = self.host_pointer # For buffer interface @@ -2179,8 +2248,7 @@ def deref(): weakref.finalize(self, deref) def __getattr__(self, fname): - """Proxy MemoryPointer methods - """ + """Proxy MemoryPointer methods""" return getattr(self._view, fname) @@ -2211,18 +2279,15 @@ def __repr__(self): if USE_NV_BINDING: default_streams = { CU_STREAM_DEFAULT: "", - binding.CU_STREAM_LEGACY: - "", - binding.CU_STREAM_PER_THREAD: - "", + binding.CU_STREAM_LEGACY: "", + binding.CU_STREAM_PER_THREAD: "", } ptr = int(self.handle) or 0 else: default_streams = { drvapi.CU_STREAM_DEFAULT: "", drvapi.CU_STREAM_LEGACY: "", - drvapi.CU_STREAM_PER_THREAD: - "", + drvapi.CU_STREAM_PER_THREAD: "", } ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT @@ -2234,18 +2299,18 @@ def __repr__(self): return "" % (ptr, self.context) def synchronize(self): - ''' + """ Wait for all commands in this stream to execute. This will commit any pending memory transfers. - ''' + """ driver.cuStreamSynchronize(self.handle) @contextlib.contextmanager def auto_synchronize(self): - ''' + """ A context manager that waits for all commands in this stream to execute and commits any pending memory transfers upon exiting the context. 
- ''' + """ yield self self.synchronize() @@ -2272,7 +2337,7 @@ def add_callback(self, callback, arg=None): data = (self, callback, arg) _py_incref(data) if USE_NV_BINDING: - ptr = int.from_bytes(self._stream_callback, byteorder='little') + ptr = int.from_bytes(self._stream_callback, byteorder="little") stream_callback = binding.CUstreamCallback(ptr) # The callback needs to receive a pointer to the data PyObject data = id(data) @@ -2373,9 +2438,9 @@ def elapsed_time(self, evtend): def event_elapsed_time(evtstart, evtend): - ''' + """ Compute the elapsed time between two events in milliseconds. - ''' + """ if USE_NV_BINDING: return driver.cuEventElapsedTime(evtstart.handle, evtend.handle) else: @@ -2408,34 +2473,35 @@ def get_global_symbol(self, name): class CtypesModule(Module): - def get_function(self, name): handle = drvapi.cu_function() - driver.cuModuleGetFunction(byref(handle), self.handle, - name.encode('utf8')) + driver.cuModuleGetFunction( + byref(handle), self.handle, name.encode("utf8") + ) return CtypesFunction(weakref.proxy(self), handle, name) def get_global_symbol(self, name): ptr = drvapi.cu_device_ptr() size = drvapi.c_size_t() - driver.cuModuleGetGlobal(byref(ptr), byref(size), self.handle, - name.encode('utf8')) + driver.cuModuleGetGlobal( + byref(ptr), byref(size), self.handle, name.encode("utf8") + ) return MemoryPointer(self.context, ptr, size), size.value class CudaPythonModule(Module): - def get_function(self, name): - handle = driver.cuModuleGetFunction(self.handle, name.encode('utf8')) + handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8")) return CudaPythonFunction(weakref.proxy(self), handle, name) def get_global_symbol(self, name): - ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode('utf8')) + ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8")) return MemoryPointer(self.context, ptr, size), size -FuncAttr = namedtuple("FuncAttr", ["regs", "shared", "local", "const", - "maxthreads"]) 
+FuncAttr = namedtuple( + "FuncAttr", ["regs", "shared", "local", "const", "maxthreads"] +) class Function(metaclass=ABCMeta): @@ -2458,8 +2524,9 @@ def device(self): return self.module.context.device @abstractmethod - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): """Set the cache configuration for this function.""" @abstractmethod @@ -2473,9 +2540,9 @@ def read_func_attr_all(self): class CtypesFunction(Function): - - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): prefer_equal = prefer_equal or (prefer_cache and prefer_shared) if prefer_equal: flag = enums.CU_FUNC_CACHE_PREFER_EQUAL @@ -2498,15 +2565,17 @@ def read_func_attr_all(self): lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) maxtpb = self.read_func_attr( - enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) - return FuncAttr(regs=nregs, const=cmem, local=lmem, shared=smem, - maxthreads=maxtpb) + enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + ) + return FuncAttr( + regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb + ) class CudaPythonFunction(Function): - - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): prefer_equal = prefer_equal or (prefer_cache and prefer_shared) attr = binding.CUfunction_attribute if prefer_equal: @@ -2529,19 +2598,26 @@ def read_func_attr_all(self): lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) maxtpb = self.read_func_attr( - attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) - return FuncAttr(regs=nregs, 
const=cmem, local=lmem, shared=smem, - maxthreads=maxtpb) - + attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + ) + return FuncAttr( + regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb + ) -def launch_kernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - args, - cooperative=False): +def launch_kernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + args, + cooperative=False, +): param_ptrs = [addressof(arg) for arg in args] params = (c_void_p * len(param_ptrs))(*param_ptrs) @@ -2553,46 +2629,54 @@ def launch_kernel(cufunc_handle, extra = None if cooperative: - driver.cuLaunchCooperativeKernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - params_for_launch) + driver.cuLaunchCooperativeKernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + params_for_launch, + ) else: - driver.cuLaunchKernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - params_for_launch, - extra) + driver.cuLaunchKernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + params_for_launch, + extra, + ) class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" @classmethod - def new(cls, - max_registers=0, - lineinfo=False, - cc=None, - lto=None, - additional_flags=None - ): - + def new( + cls, + max_registers=0, + lineinfo=False, + cc=None, + lto=None, + additional_flags=None, + ): driver_ver = driver.get_version() - if ( - config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY - and driver_ver >= (12, 0) + if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and driver_ver >= ( + 12, + 0, ): - raise ValueError( - "Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC" - ) + raise ValueError("Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC") if config.CUDA_ENABLE_PYNVJITLINK and driver_ver < (12, 0): - raise ValueError( - "Enabling pynvjitlink requires CUDA 12." 
- ) + raise ValueError("Enabling pynvjitlink requires CUDA 12.") if config.CUDA_ENABLE_PYNVJITLINK: linker = PyNvJitLinker @@ -2641,9 +2725,9 @@ def add_cu(self, cu, name): ptx, log = nvrtc.compile(cu, name, cc) if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % name).center(80, '-')) + print(("ASSEMBLY %s" % name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) # Link the program's PTX using the normal linker mechanism ptx_name = os.path.splitext(name)[0] + ".ptx" @@ -2654,7 +2738,7 @@ def add_file(self, path, kind): """Add code from a file to the link""" def add_cu_file(self, path): - with open(path, 'rb') as f: + with open(path, "rb") as f: cu = f.read() self.add_cu(cu, os.path.basename(path)) @@ -2672,24 +2756,24 @@ def add_file_guess_ext(self, path_or_code, ignore_nonlto=False): if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix - if ext == '': + if ext == "": raise RuntimeError( "Don't know how to link file with no extension" ) - elif ext == '.cu': + elif ext == ".cu": self.add_cu_file(path_or_code) else: - kind = FILE_EXTENSION_MAP.get(ext.lstrip('.'), None) + kind = FILE_EXTENSION_MAP.get(ext.lstrip("."), None) if kind is None: raise RuntimeError( - "Don't know how to link file with extension " - f"{ext}" + f"Don't know how to link file with extension {ext}" ) if ignore_nonlto: warn_and_return = False if kind in ( - FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"] + FILE_EXTENSION_MAP["fatbin"], + FILE_EXTENSION_MAP["o"], ): entry_types = inspect_obj_content(path_or_code) if "nvvm" not in entry_types: @@ -2754,6 +2838,7 @@ class MVCLinker(Linker): Linker supporting Minor Version Compatibility, backed by the cubinlinker package. 
""" + def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker @@ -2761,18 +2846,20 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: - raise RuntimeError("MVCLinker requires Compute Capability to be " - "specified, but cc is None") + raise RuntimeError( + "MVCLinker requires Compute Capability to be " + "specified, but cc is None" + ) super().__init__(max_registers, lineinfo, cc) arch = f"sm_{cc[0] * 10 + cc[1]}" - ptx_compile_opts = ['--gpu-name', arch, '-c'] + ptx_compile_opts = ["--gpu-name", arch, "-c"] if max_registers: arg = f"--maxrregcount={max_registers}" ptx_compile_opts.append(arg) if lineinfo: - ptx_compile_opts.append('--generate-line-info') + ptx_compile_opts.append("--generate-line-info") self.ptx_compile_options = tuple(ptx_compile_opts) self._linker = CubinLinker(f"--arch={arch}") @@ -2785,7 +2872,7 @@ def info_log(self): def error_log(self): return self._linker.error_log - def add_ptx(self, ptx, name=''): + def add_ptx(self, ptx, name=""): try: from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError @@ -2804,19 +2891,19 @@ def add_file(self, path, kind): raise ImportError(_MVC_ERROR_MESSAGE) from err try: - with open(path, 'rb') as f: + with open(path, "rb") as f: data = f.read() except FileNotFoundError: - raise LinkerError(f'{path} not found') + raise LinkerError(f"{path} not found") name = pathlib.Path(path).name - if kind == FILE_EXTENSION_MAP['cubin']: + if kind == FILE_EXTENSION_MAP["cubin"]: fn = self._linker.add_cubin - elif kind == FILE_EXTENSION_MAP['fatbin']: + elif kind == FILE_EXTENSION_MAP["fatbin"]: fn = self._linker.add_fatbin - elif kind == FILE_EXTENSION_MAP['a']: + elif kind == FILE_EXTENSION_MAP["a"]: raise LinkerError(f"Don't know how to link {kind}") - elif kind == FILE_EXTENSION_MAP['ptx']: + elif kind == FILE_EXTENSION_MAP["ptx"]: return self.add_ptx(data, name) else: raise 
LinkerError(f"Don't know how to link {kind}") @@ -2842,6 +2929,7 @@ class CtypesLinker(Linker): """ Links for current device if no CC given """ + def __init__(self, max_registers=0, lineinfo=False, cc=None): super().__init__(max_registers, lineinfo, cc) @@ -2875,8 +2963,9 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): option_vals = (c_void_p * len(raw_values))(*raw_values) self.handle = handle = drvapi.cu_link_state() - driver.cuLinkCreate(len(raw_keys), option_keys, option_vals, - byref(self.handle)) + driver.cuLinkCreate( + len(raw_keys), option_keys, option_vals, byref(self.handle) + ) weakref.finalize(self, driver.cuLinkDestroy, handle) @@ -2887,19 +2976,27 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): @property def info_log(self): - return self.linker_info_buf.value.decode('utf8') + return self.linker_info_buf.value.decode("utf8") @property def error_log(self): - return self.linker_errors_buf.value.decode('utf8') + return self.linker_errors_buf.value.decode("utf8") - def add_ptx(self, ptx, name=''): + def add_ptx(self, ptx, name=""): ptxbuf = c_char_p(ptx) - namebuf = c_char_p(name.encode('utf8')) + namebuf = c_char_p(name.encode("utf8")) self._keep_alive += [ptxbuf, namebuf] try: - driver.cuLinkAddData(self.handle, enums.CU_JIT_INPUT_PTX, - ptxbuf, len(ptx), namebuf, 0, None, None) + driver.cuLinkAddData( + self.handle, + enums.CU_JIT_INPUT_PTX, + ptxbuf, + len(ptx), + namebuf, + 0, + None, + None, + ) except CudaAPIError as e: raise LinkerError("%s\n%s" % (e, self.error_log)) @@ -2911,7 +3008,7 @@ def add_file(self, path, kind): driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None) except CudaAPIError as e: if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND: - msg = f'{path} not found' + msg = f"{path} not found" else: msg = "%s\n%s" % (e, self.error_log) raise LinkerError(msg) @@ -2926,7 +3023,7 @@ def complete(self): raise LinkerError("%s\n%s" % (e, self.error_log)) size = size.value - assert size > 0, 'linker 
returned a zero sized cubin' + assert size > 0, "linker returned a zero sized cubin" del self._keep_alive[:] # We return a copy of the cubin because it's owned by the linker @@ -2938,6 +3035,7 @@ class CudaPythonLinker(Linker): """ Links for current device if no CC given """ + def __init__(self, max_registers=0, lineinfo=False, cc=None): super().__init__(max_registers, lineinfo, cc) @@ -2964,8 +3062,9 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): options[jit_option.CU_JIT_TARGET_FROM_CUCONTEXT] = 1 else: cc_val = cc[0] * 10 + cc[1] - cc_enum = getattr(binding.CUjit_target, - f'CU_TARGET_COMPUTE_{cc_val}') + cc_enum = getattr( + binding.CUjit_target, f"CU_TARGET_COMPUTE_{cc_val}" + ) options[jit_option.CU_JIT_TARGET] = cc_enum raw_keys = list(options.keys()) @@ -2982,19 +3081,20 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): @property def info_log(self): - return self.linker_info_buf.decode('utf8') + return self.linker_info_buf.decode("utf8") @property def error_log(self): - return self.linker_errors_buf.decode('utf8') + return self.linker_errors_buf.decode("utf8") - def add_ptx(self, ptx, name=''): - namebuf = name.encode('utf8') + def add_ptx(self, ptx, name=""): + namebuf = name.encode("utf8") self._keep_alive += [ptx, namebuf] try: input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX - driver.cuLinkAddData(self.handle, input_ptx, ptx, len(ptx), - namebuf, 0, [], []) + driver.cuLinkAddData( + self.handle, input_ptx, ptx, len(ptx), namebuf, 0, [], [] + ) except CudaAPIError as e: raise LinkerError("%s\n%s" % (e, self.error_log)) @@ -3006,7 +3106,7 @@ def add_file(self, path, kind): driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, [], []) except CudaAPIError as e: if e.code == binding.CUresult.CUDA_ERROR_FILE_NOT_FOUND: - msg = f'{path} not found' + msg = f"{path} not found" else: msg = "%s\n%s" % (e, self.error_log) raise LinkerError(msg) @@ -3017,7 +3117,7 @@ def complete(self): except CudaAPIError as e: raise 
LinkerError("%s\n%s" % (e, self.error_log)) - assert size > 0, 'linker returned a zero sized cubin' + assert size > 0, "linker returned a zero sized cubin" del self._keep_alive[:] # We return a copy of the cubin because it's owned by the linker cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) @@ -3151,6 +3251,7 @@ def complete(self): except NvJitLinkError as e: raise LinkerError from e + # ----------------------------------------------------------------------------- @@ -3200,7 +3301,7 @@ def device_memory_size(devmem): The result is cached in the device memory object. It may query the driver for the memory size of the device memory allocation. """ - sz = getattr(devmem, '_cuda_memsize_', None) + sz = getattr(devmem, "_cuda_memsize_", None) if sz is None: s, e = device_extents(devmem) if USE_NV_BINDING: @@ -3213,10 +3314,9 @@ def device_memory_size(devmem): def _is_datetime_dtype(obj): - """Returns True if the obj.dtype is datetime64 or timedelta64 - """ - dtype = getattr(obj, 'dtype', None) - return dtype is not None and dtype.char in 'Mm' + """Returns True if the obj.dtype is datetime64 or timedelta64""" + dtype = getattr(obj, "dtype", None) + return dtype is not None and dtype.char in "Mm" def _workaround_for_datetime(obj): @@ -3295,12 +3395,11 @@ def is_device_memory(obj): "device_pointer" which value is an int object carrying the pointer value of the device memory address. This is not tested in this method. """ - return getattr(obj, '__cuda_memory__', False) + return getattr(obj, "__cuda_memory__", False) def require_device_memory(obj): - """A sentry for methods that accept CUDA memory object. - """ + """A sentry for methods that accept CUDA memory object.""" if not is_device_memory(obj): raise Exception("Not a CUDA memory object.") @@ -3391,16 +3490,16 @@ def device_memset(dst, val, size, stream=0): def profile_start(): - ''' + """ Enable profile collection in the current context. 
- ''' + """ driver.cuProfilerStart() def profile_stop(): - ''' + """ Disable profile collection in the current context. - ''' + """ driver.cuProfilerStop() @@ -3427,18 +3526,21 @@ def inspect_obj_content(objpath: str): Given path to a fatbin or object, use `cuobjdump` to examine its content Return the set of entries in the object. """ - code_types :set[str] = set() + code_types: set[str] = set() try: - out = subprocess.run(["cuobjdump", objpath], check=True, - capture_output=True) + out = subprocess.run( + ["cuobjdump", objpath], check=True, capture_output=True + ) except FileNotFoundError as e: - msg = ("cuobjdump has not been found. You may need " - "to install the CUDA toolkit and ensure that " - "it is available on your PATH.\n") + msg = ( + "cuobjdump has not been found. You may need " + "to install the CUDA toolkit and ensure that " + "it is available on your PATH.\n" + ) raise RuntimeError(msg) from e - objtable = out.stdout.decode('utf-8') + objtable = out.stdout.decode("utf-8") entry_pattern = r"Fatbin (.*) code" for line in objtable.split("\n"): if match := re.match(entry_pattern, line): diff --git a/numba_cuda/numba/cuda/cudadrv/drvapi.py b/numba_cuda/numba/cuda/cudadrv/drvapi.py index 7f6dfbbdc..1aeeecc44 100644 --- a/numba_cuda/numba/cuda/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/cudadrv/drvapi.py @@ -1,20 +1,31 @@ -from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint, - c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER, - Structure) +from ctypes import ( + c_byte, + c_char_p, + c_float, + c_int, + c_size_t, + c_uint, + c_uint8, + c_void_p, + py_object, + CFUNCTYPE, + POINTER, + Structure, +) cu_device = c_int -cu_device_attribute = c_int # enum -cu_context = c_void_p # an opaque handle -cu_module = c_void_p # an opaque handle -cu_jit_option = c_int # enum -cu_jit_input_type = c_int # enum -cu_function = c_void_p # an opaque handle -cu_device_ptr = c_size_t # defined as unsigned long long -cu_stream = c_void_p # an opaque handle 
+cu_device_attribute = c_int # enum +cu_context = c_void_p # an opaque handle +cu_module = c_void_p # an opaque handle +cu_jit_option = c_int # enum +cu_jit_input_type = c_int # enum +cu_function = c_void_p # an opaque handle +cu_device_ptr = c_size_t # defined as unsigned long long +cu_stream = c_void_p # an opaque handle cu_event = c_void_p cu_link_state = c_void_p cu_function_attribute = c_int -cu_uuid = (c_byte * 16) # Device UUID +cu_uuid = c_byte * 16 # Device UUID cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) @@ -33,154 +44,145 @@ class cu_ipc_mem_handle(Structure): API_PROTOTYPES = { # CUresult cuInit(unsigned int Flags); - 'cuInit' : (c_int, c_uint), - + "cuInit": (c_int, c_uint), # CUresult cuDriverGetVersion (int* driverVersion ) - 'cuDriverGetVersion': (c_int, POINTER(c_int)), - + "cuDriverGetVersion": (c_int, POINTER(c_int)), # CUresult cuDeviceGetCount(int *count); - 'cuDeviceGetCount': (c_int, POINTER(c_int)), - + "cuDeviceGetCount": (c_int, POINTER(c_int)), # CUresult cuDeviceGet(CUdevice *device, int ordinal); - 'cuDeviceGet': (c_int, POINTER(cu_device), c_int), - + "cuDeviceGet": (c_int, POINTER(cu_device), c_int), # CUresult cuDeviceGetName ( char* name, int len, CUdevice dev ) - 'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device), - + "cuDeviceGetName": (c_int, c_char_p, c_int, cu_device), # CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, # CUdevice dev); - 'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute, - cu_device), - + "cuDeviceGetAttribute": ( + c_int, + POINTER(c_int), + cu_device_attribute, + cu_device, + ), # CUresult cuDeviceComputeCapability(int *major, int *minor, # CUdevice dev); - 'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int), - cu_device), - + "cuDeviceComputeCapability": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_device, + ), # CUresult cuDevicePrimaryCtxGetState( # CUdevice dev, # unsigned int* flags, # int* active) - 
'cuDevicePrimaryCtxGetState': (c_int, - cu_device, POINTER(c_uint), POINTER(c_int)), - + "cuDevicePrimaryCtxGetState": ( + c_int, + cu_device, + POINTER(c_uint), + POINTER(c_int), + ), # CUresult cuDevicePrimaryCtxRelease ( CUdevice dev ) - 'cuDevicePrimaryCtxRelease': (c_int, cu_device), - + "cuDevicePrimaryCtxRelease": (c_int, cu_device), # CUresult cuDevicePrimaryCtxReset ( CUdevice dev ) - 'cuDevicePrimaryCtxReset': (c_int, cu_device), - + "cuDevicePrimaryCtxReset": (c_int, cu_device), # CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev ) - 'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device), - + "cuDevicePrimaryCtxRetain": (c_int, POINTER(cu_context), cu_device), # CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags ) - 'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint), - + "cuDevicePrimaryCtxSetFlags": (c_int, cu_device, c_uint), # CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, # CUdevice dev); - 'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device), - + "cuCtxCreate": (c_int, POINTER(cu_context), c_uint, cu_device), # CUresult cuCtxGetDevice ( CUdevice * device ) - 'cuCtxGetDevice': (c_int, POINTER(cu_device)), - + "cuCtxGetDevice": (c_int, POINTER(cu_device)), # CUresult cuCtxGetCurrent (CUcontext *pctx); - 'cuCtxGetCurrent': (c_int, POINTER(cu_context)), - + "cuCtxGetCurrent": (c_int, POINTER(cu_context)), # CUresult cuCtxPushCurrent (CUcontext pctx); - 'cuCtxPushCurrent': (c_int, cu_context), - + "cuCtxPushCurrent": (c_int, cu_context), # CUresult cuCtxPopCurrent (CUcontext *pctx); - 'cuCtxPopCurrent': (c_int, POINTER(cu_context)), - + "cuCtxPopCurrent": (c_int, POINTER(cu_context)), # CUresult cuCtxDestroy(CUcontext pctx); - 'cuCtxDestroy': (c_int, cu_context), - + "cuCtxDestroy": (c_int, cu_context), # CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, # unsigned int numOptions, # CUjit_option *options, # void **optionValues); - 'cuModuleLoadDataEx': (c_int, 
cu_module, c_void_p, c_uint, - POINTER(cu_jit_option), POINTER(c_void_p)), - + "cuModuleLoadDataEx": ( + c_int, + cu_module, + c_void_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult cuModuleUnload(CUmodule hmod); - 'cuModuleUnload': (c_int, cu_module), - + "cuModuleUnload": (c_int, cu_module), # CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, # const char *name); - 'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p), - + "cuModuleGetFunction": (c_int, cu_function, cu_module, c_char_p), # CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule # hmod, const char* name ) - 'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t), - cu_module, c_char_p), - + "cuModuleGetGlobal": ( + c_int, + POINTER(cu_device_ptr), + POINTER(c_size_t), + cu_module, + c_char_p, + ), # CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, # CUfunc_cache config); - 'cuFuncSetCacheConfig': (c_int, cu_function, c_uint), - + "cuFuncSetCacheConfig": (c_int, cu_function, c_uint), # CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); - 'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t), - + "cuMemAlloc": (c_int, POINTER(cu_device_ptr), c_size_t), # CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, # unsigned int flags); - 'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemAllocManaged": (c_int, c_void_p, c_size_t, c_uint), # CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) - 'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t), - + "cuMemsetD8": (c_int, cu_device_ptr, c_uint8, c_size_t), # CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, # size_t N, CUstream hStream); - 'cuMemsetD8Async': (c_int, - cu_device_ptr, c_uint8, c_size_t, cu_stream), - + "cuMemsetD8Async": (c_int, cu_device_ptr, c_uint8, c_size_t, cu_stream), # CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, # size_t ByteCount); - 'cuMemcpyHtoD': 
(c_int, cu_device_ptr, c_void_p, c_size_t), - + "cuMemcpyHtoD": (c_int, cu_device_ptr, c_void_p, c_size_t), # CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, # size_t ByteCount, CUstream hStream); - 'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t, - cu_stream), - + "cuMemcpyHtoDAsync": (c_int, cu_device_ptr, c_void_p, c_size_t, cu_stream), # CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice, # size_t ByteCount); - 'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t), - + "cuMemcpyDtoD": (c_int, cu_device_ptr, cu_device_ptr, c_size_t), # CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice, # size_t ByteCount, CUstream hStream); - 'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t, - cu_stream), - - + "cuMemcpyDtoDAsync": ( + c_int, + cu_device_ptr, + cu_device_ptr, + c_size_t, + cu_stream, + ), # CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, # size_t ByteCount); - 'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t), - + "cuMemcpyDtoH": (c_int, c_void_p, cu_device_ptr, c_size_t), # CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, # size_t ByteCount, CUstream hStream); - 'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t, - cu_stream), - + "cuMemcpyDtoHAsync": (c_int, c_void_p, cu_device_ptr, c_size_t, cu_stream), # CUresult cuMemFree(CUdeviceptr dptr); - 'cuMemFree': (c_int, cu_device_ptr), - + "cuMemFree": (c_int, cu_device_ptr), # CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); - 'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint), - + "cuStreamCreate": (c_int, POINTER(cu_stream), c_uint), # CUresult cuStreamDestroy(CUstream hStream); - 'cuStreamDestroy': (c_int, cu_stream), - + "cuStreamDestroy": (c_int, cu_stream), # CUresult cuStreamSynchronize(CUstream hStream); - 'cuStreamSynchronize': (c_int, cu_stream), - + "cuStreamSynchronize": (c_int, cu_stream), # CUresult cuStreamAddCallback( # 
CUstream hStream, # CUstreamCallback callback, # void* userData, # unsigned int flags) - 'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj, - py_object, c_uint), - + "cuStreamAddCallback": ( + c_int, + cu_stream, + cu_stream_callback_pyobj, + py_object, + c_uint, + ), # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, @@ -190,10 +192,20 @@ class cu_ipc_mem_handle(Structure): # unsigned int sharedMemBytes, # CUstream hStream, void **kernelParams, # void ** extra) - 'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint, - c_uint, c_uint, c_uint, c_uint, cu_stream, - POINTER(c_void_p), POINTER(c_void_p)), - + "cuLaunchKernel": ( + c_int, + cu_function, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + cu_stream, + POINTER(c_void_p), + POINTER(c_void_p), + ), # CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, @@ -202,197 +214,219 @@ class cu_ipc_mem_handle(Structure): # unsigned int blockDimZ, # unsigned int sharedMemBytes, # CUstream hStream, void **kernelParams) - 'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint, - c_uint, c_uint, c_uint, c_uint, cu_stream, - POINTER(c_void_p)), - + "cuLaunchCooperativeKernel": ( + c_int, + cu_function, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + cu_stream, + POINTER(c_void_p), + ), # CUresult cuMemHostAlloc ( void ** pp, # size_t bytesize, # unsigned int Flags # ) - 'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemHostAlloc": (c_int, c_void_p, c_size_t, c_uint), # CUresult cuMemFreeHost ( void * p ) - 'cuMemFreeHost': (c_int, c_void_p), - + "cuMemFreeHost": (c_int, c_void_p), # CUresult cuMemHostRegister(void * p, # size_t bytesize, # unsigned int Flags) - 'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemHostRegister": (c_int, c_void_p, c_size_t, c_uint), # CUresult 
cuMemHostUnregister(void * p) - 'cuMemHostUnregister': (c_int, c_void_p), - + "cuMemHostUnregister": (c_int, c_void_p), # CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr, # void * p, # unsigned int Flags) - 'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr), - c_void_p, c_uint), - + "cuMemHostGetDevicePointer": ( + c_int, + POINTER(cu_device_ptr), + c_void_p, + c_uint, + ), # CUresult cuMemGetInfo(size_t * free, size_t * total) - 'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)), - + "cuMemGetInfo": (c_int, POINTER(c_size_t), POINTER(c_size_t)), # CUresult cuEventCreate ( CUevent * phEvent, # unsigned int Flags ) - 'cuEventCreate': (c_int, POINTER(cu_event), c_uint), - + "cuEventCreate": (c_int, POINTER(cu_event), c_uint), # CUresult cuEventDestroy ( CUevent hEvent ) - 'cuEventDestroy': (c_int, cu_event), - + "cuEventDestroy": (c_int, cu_event), # CUresult cuEventElapsedTime ( float * pMilliseconds, # CUevent hStart, # CUevent hEnd ) - 'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event), - + "cuEventElapsedTime": (c_int, POINTER(c_float), cu_event, cu_event), # CUresult cuEventQuery ( CUevent hEvent ) - 'cuEventQuery': (c_int, cu_event), - + "cuEventQuery": (c_int, cu_event), # CUresult cuEventRecord ( CUevent hEvent, # CUstream hStream ) - 'cuEventRecord': (c_int, cu_event, cu_stream), - + "cuEventRecord": (c_int, cu_event, cu_stream), # CUresult cuEventSynchronize ( CUevent hEvent ) - 'cuEventSynchronize': (c_int, cu_event), - - + "cuEventSynchronize": (c_int, cu_event), # CUresult cuStreamWaitEvent ( CUstream hStream, # CUevent hEvent, # unsigned int Flags ) - 'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint), - + "cuStreamWaitEvent": (c_int, cu_stream, cu_event, c_uint), # CUresult cuPointerGetAttribute ( # void *data, # CUpointer_attribute attribute, # CUdeviceptr ptr) - 'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr), - + "cuPointerGetAttribute": (c_int, c_void_p, c_uint, cu_device_ptr), # 
CUresult cuMemGetAddressRange ( CUdeviceptr * pbase, # size_t * psize, # CUdeviceptr dptr # ) - 'cuMemGetAddressRange': (c_int, - POINTER(cu_device_ptr), - POINTER(c_size_t), - cu_device_ptr), - + "cuMemGetAddressRange": ( + c_int, + POINTER(cu_device_ptr), + POINTER(c_size_t), + cu_device_ptr, + ), # CUresult cuMemHostGetFlags ( unsigned int * pFlags, # void * p ) - 'cuMemHostGetFlags': (c_int, - POINTER(c_uint), - c_void_p), - + "cuMemHostGetFlags": (c_int, POINTER(c_uint), c_void_p), # CUresult cuCtxSynchronize ( void ) - 'cuCtxSynchronize' : (c_int,), - + "cuCtxSynchronize": (c_int,), # CUresult # cuLinkCreate(unsigned int numOptions, CUjit_option *options, # void **optionValues, CUlinkState *stateOut); - 'cuLinkCreate': (c_int, - c_uint, POINTER(cu_jit_option), - POINTER(c_void_p), POINTER(cu_link_state)), - + "cuLinkCreate": ( + c_int, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + POINTER(cu_link_state), + ), # CUresult # cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, # size_t size, const char *name, unsigned # int numOptions, CUjit_option *options, # void **optionValues); - 'cuLinkAddData': (c_int, - cu_link_state, cu_jit_input_type, c_void_p, - c_size_t, c_char_p, c_uint, POINTER(cu_jit_option), - POINTER(c_void_p)), - + "cuLinkAddData": ( + c_int, + cu_link_state, + cu_jit_input_type, + c_void_p, + c_size_t, + c_char_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult # cuLinkAddFile(CUlinkState state, CUjitInputType type, # const char *path, unsigned int numOptions, # CUjit_option *options, void **optionValues); - - 'cuLinkAddFile': (c_int, - cu_link_state, cu_jit_input_type, c_char_p, c_uint, - POINTER(cu_jit_option), POINTER(c_void_p)), - + "cuLinkAddFile": ( + c_int, + cu_link_state, + cu_jit_input_type, + c_char_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult CUDAAPI # cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) - 'cuLinkComplete': (c_int, - 
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)), - + "cuLinkComplete": ( + c_int, + cu_link_state, + POINTER(c_void_p), + POINTER(c_size_t), + ), # CUresult CUDAAPI # cuLinkDestroy(CUlinkState state) - 'cuLinkDestroy': (c_int, cu_link_state), - + "cuLinkDestroy": (c_int, cu_link_state), # cuProfilerStart ( void ) - 'cuProfilerStart': (c_int,), - + "cuProfilerStart": (c_int,), # cuProfilerStop ( void ) - 'cuProfilerStop': (c_int,), - + "cuProfilerStop": (c_int,), # CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib, # CUfunction hfunc ) - 'cuFuncGetAttribute': (c_int, - POINTER(c_int), cu_function_attribute, cu_function), - + "cuFuncGetAttribute": ( + c_int, + POINTER(c_int), + cu_function_attribute, + cu_function, + ), # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor( # int *numBlocks, # CUfunction func, # int blockSize, # size_t dynamicSMemSize); - 'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int), - cu_function, c_size_t, - c_uint), - + "cuOccupancyMaxActiveBlocksPerMultiprocessor": ( + c_int, + POINTER(c_int), + cu_function, + c_size_t, + c_uint, + ), # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( # int *numBlocks, # CUfunction func, # int blockSize, # size_t dynamicSMemSize, # unsigned int flags); - 'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int, - POINTER(c_int), - cu_function, - c_size_t, c_uint), - + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags": ( + c_int, + POINTER(c_int), + cu_function, + c_size_t, + c_uint, + ), # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize( # int *minGridSize, int *blockSize, # CUfunction func, # CUoccupancyB2DSize blockSizeToDynamicSMemSize, # size_t dynamicSMemSize, int blockSizeLimit); - 'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int), - cu_function, cu_occupancy_b2d_size, - c_size_t, c_int), - + "cuOccupancyMaxPotentialBlockSize": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_function, + 
cu_occupancy_b2d_size, + c_size_t, + c_int, + ), # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags( # int *minGridSize, int *blockSize, # CUfunction func, # CUoccupancyB2DSize blockSizeToDynamicSMemSize, # size_t dynamicSMemSize, int blockSizeLimit, # unsigned int flags); - 'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int), - POINTER(c_int), cu_function, - cu_occupancy_b2d_size, - c_size_t, c_int, c_uint), - + "cuOccupancyMaxPotentialBlockSizeWithFlags": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_function, + cu_occupancy_b2d_size, + c_size_t, + c_int, + c_uint, + ), # CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr ) - 'cuIpcGetMemHandle': (c_int, - POINTER(cu_ipc_mem_handle), cu_device_ptr), - + "cuIpcGetMemHandle": (c_int, POINTER(cu_ipc_mem_handle), cu_device_ptr), # CUresult cuIpcOpenMemHandle( # CUdeviceptr* pdptr, # CUipcMemHandle handle, # unsigned int Flags) - 'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle, - c_uint), - + "cuIpcOpenMemHandle": ( + c_int, + POINTER(cu_device_ptr), + cu_ipc_mem_handle, + c_uint, + ), # CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr ) - - 'cuIpcCloseMemHandle': (c_int, cu_device_ptr), - + "cuIpcCloseMemHandle": (c_int, cu_device_ptr), # CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags) - 'cuCtxEnablePeerAccess': (c_int, cu_context, c_int), - + "cuCtxEnablePeerAccess": (c_int, cu_context, c_int), # CUresult cuDeviceCanAccessPeer ( int* canAccessPeer, # CUdevice dev, CUdevice peerDev ) - 'cuDeviceCanAccessPeer': (c_int, - POINTER(c_int), cu_device, cu_device), - + "cuDeviceCanAccessPeer": (c_int, POINTER(c_int), cu_device, cu_device), # CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev ) - 'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device), + "cuDeviceGetUuid": (c_int, POINTER(cu_uuid), cu_device), } diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index 
38e1b890e..a3e21b633 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -20,7 +20,7 @@ np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides ctypes.c_long, # itemsize ctypes.c_int, # is_f_order -)(_helperlib.c_helpers['attempt_nocopy_reshape']) +)(_helperlib.c_helpers["attempt_nocopy_reshape"]) class Dim(object): @@ -37,7 +37,8 @@ class Dim(object): stride: item stride """ - __slots__ = 'start', 'stop', 'size', 'stride', 'single' + + __slots__ = "start", "stop", "size", "stride", "single" def __init__(self, start, stop, size, stride, single): self.start = start @@ -58,15 +59,11 @@ def __getitem__(self, item): else: size = _compute_size(start, stop, stride) ret = Dim( - start=start, - stop=stop, - size=size, - stride=stride, - single=False + start=start, stop=stop, size=size, stride=stride, single=False ) return ret else: - sliced = self[item:item + 1] if item != -1 else self[-1:] + sliced = self[item : item + 1] if item != -1 else self[-1:] if sliced.size != 1: raise IndexError return Dim( @@ -85,8 +82,13 @@ def __repr__(self): return strfmt % (self.start, self.stop, self.size, self.stride) def normalize(self, base): - return Dim(start=self.start - base, stop=self.stop - base, - size=self.size, stride=self.stride, single=self.single) + return Dim( + start=self.start - base, + stop=self.stop - base, + size=self.size, + stride=self.stride, + single=self.single, + ) def copy(self, start=None, stop=None, size=None, stride=None, single=None): if start is None: @@ -143,14 +145,16 @@ class Array(object): extent: (start, end) start and end offset containing the memory region """ + is_array = True @classmethod def from_desc(cls, offset, shape, strides, itemsize): dims = [] for ashape, astride in zip(shape, strides): - dim = Dim(offset, offset + ashape * astride, ashape, astride, - single=False) + dim = Dim( + offset, offset + ashape * astride, ashape, astride, single=False + ) dims.append(dim) offset = 0 
# offset only applies to first dimension return cls(dims, itemsize) @@ -173,23 +177,23 @@ def _compute_layout(self): # Records have no dims, and we can treat them as contiguous if not self.dims: - return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} # If this is a broadcast array then it is not contiguous if any([dim.stride == 0 for dim in self.dims]): - return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False} + return {"C_CONTIGUOUS": False, "F_CONTIGUOUS": False} - flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + flags = {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} # Check C contiguity sd = self.itemsize for dim in reversed(self.dims): if dim.size == 0: # Contiguous by definition - return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} if dim.size != 1: if dim.stride != sd: - flags['C_CONTIGUOUS'] = False + flags["C_CONTIGUOUS"] = False sd *= dim.size # Check F contiguity @@ -197,7 +201,7 @@ def _compute_layout(self): for dim in self.dims: if dim.size != 1: if dim.stride != sd: - flags['F_CONTIGUOUS'] = False + flags["F_CONTIGUOUS"] = False return flags sd *= dim.size @@ -208,11 +212,11 @@ def _compute_extent(self): lastidx = [s - 1 for s in self.shape] start = compute_index(firstidx, self.dims) stop = compute_index(lastidx, self.dims) + self.itemsize - stop = max(stop, start) # ensure positive extent + stop = max(stop, start) # ensure positive extent return Extent(start, stop) def __repr__(self): - return '' % (self.dims, self.itemsize) + return "" % (self.dims, self.itemsize) def __getitem__(self, item): if not isinstance(item, tuple): @@ -240,15 +244,14 @@ def __getitem__(self, item): @property def is_c_contig(self): - return self.flags['C_CONTIGUOUS'] + return self.flags["C_CONTIGUOUS"] @property def is_f_contig(self): - return self.flags['F_CONTIGUOUS'] + return self.flags["F_CONTIGUOUS"] def iter_contiguous_extent(self): - """ Generates extents - """ + 
"""Generates extents""" if self.is_c_contig or self.is_f_contig: yield self.extent else: @@ -279,11 +282,11 @@ def reshape(self, *newdims, **kws): if newdims == self.shape: return self, None - order = kws.pop('order', 'C') + order = kws.pop("order", "C") if kws: - raise TypeError('unknown keyword arguments %s' % kws.keys()) - if order not in 'CFA': - raise ValueError('order not C|F|A') + raise TypeError("unknown keyword arguments %s" % kws.keys()) + if order not in "CFA": + raise ValueError("order not C|F|A") # check for exactly one instance of -1 in newdims # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501 @@ -301,25 +304,28 @@ def reshape(self, *newdims, **kws): # compute the missing dimension if unknownidx >= 0: if knownsize == 0 or self.size % knownsize != 0: - raise ValueError("cannot infer valid shape " - "for unknown dimension") + raise ValueError( + "cannot infer valid shape for unknown dimension" + ) else: - newdims = newdims[0:unknownidx] \ - + (self.size // knownsize,) \ - + newdims[unknownidx + 1:] + newdims = ( + newdims[0:unknownidx] + + (self.size // knownsize,) + + newdims[unknownidx + 1 :] + ) newsize = functools.reduce(operator.mul, newdims, 1) - if order == 'A': - order = 'F' if self.is_f_contig else 'C' + if order == "A": + order = "F" if self.is_f_contig else "C" if newsize != self.size: raise ValueError("reshape changes the size of the array") if self.is_c_contig or self.is_f_contig: - if order == 'C': + if order == "C": newstrides = list(iter_strides_c_contig(self, newdims)) - elif order == 'F': + elif order == "F": newstrides = list(iter_strides_f_contig(self, newdims)) else: raise AssertionError("unreachable") @@ -340,12 +346,16 @@ def reshape(self, *newdims, **kws): newdims, newstrides, self.itemsize, - order == 'F', + order == "F", ): - raise NotImplementedError('reshape would require copy') + raise NotImplementedError("reshape would require copy") - ret 
= self.from_desc(self.extent.begin, shape=newdims, - strides=newstrides, itemsize=self.itemsize) + ret = self.from_desc( + self.extent.begin, + shape=newdims, + strides=newstrides, + itemsize=self.itemsize, + ) return ret, list(self.iter_contiguous_extent()) @@ -377,16 +387,21 @@ def squeeze(self, axis=None): ) return newarr, list(self.iter_contiguous_extent()) - def ravel(self, order='C'): - if order not in 'CFA': - raise ValueError('order not C|F|A') + def ravel(self, order="C"): + if order not in "CFA": + raise ValueError("order not C|F|A") - if (order in 'CA' and self.is_c_contig - or order in 'FA' and self.is_f_contig): + if ( + order in "CA" + and self.is_c_contig + or order in "FA" + and self.is_f_contig + ): newshape = (self.size,) newstrides = (self.itemsize,) - arr = self.from_desc(self.extent.begin, newshape, newstrides, - self.itemsize) + arr = self.from_desc( + self.extent.begin, newshape, newstrides, self.itemsize + ) return arr, list(self.iter_contiguous_extent()) else: @@ -394,8 +409,7 @@ def ravel(self, order='C'): def iter_strides_f_contig(arr, shape=None): - """yields the f-contiguous strides - """ + """yields the f-contiguous strides""" shape = arr.shape if shape is None else shape itemsize = arr.itemsize yield itemsize @@ -406,8 +420,7 @@ def iter_strides_f_contig(arr, shape=None): def iter_strides_c_contig(arr, shape=None): - """yields the c-contiguous strides - """ + """yields the c-contiguous strides""" shape = arr.shape if shape is None else shape itemsize = arr.itemsize @@ -438,8 +451,7 @@ def is_element_indexing(item, ndim): def _compute_size(start, stop, step): - """Algorithm adapted from cpython rangeobject.c - """ + """Algorithm adapted from cpython rangeobject.c""" if step > 0: lo = start hi = stop diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index e40bb182f..987234b6f 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -140,7 +140,7 @@ # 
Force synchronous blocking on cudaMemcpy/cudaMemset CU_CTX_SYNC_MEMOPS = 0x80 -CU_CTX_FLAGS_MASK = 0xff +CU_CTX_FLAGS_MASK = 0xFF # DEFINES diff --git a/numba_cuda/numba/cuda/cudadrv/error.py b/numba_cuda/numba/cuda/cudadrv/error.py index ec3420586..87528d06d 100644 --- a/numba_cuda/numba/cuda/cudadrv/error.py +++ b/numba_cuda/numba/cuda/cudadrv/error.py @@ -12,7 +12,7 @@ class CudaSupportError(ImportError): class NvvmError(Exception): def __str__(self): - return '\n'.join(map(str, self.args)) + return "\n".join(map(str, self.args)) class NvvmSupportError(ImportError): @@ -25,7 +25,7 @@ class NvvmWarning(Warning): class NvrtcError(Exception): def __str__(self): - return '\n'.join(map(str, self.args)) + return "\n".join(map(str, self.args)) class NvrtcCompilationError(NvrtcError): diff --git a/numba_cuda/numba/cuda/cudadrv/libs.py b/numba_cuda/numba/cuda/cudadrv/libs.py index 70c385041..7388db898 100644 --- a/numba_cuda/numba/cuda/cudadrv/libs.py +++ b/numba_cuda/numba/cuda/cudadrv/libs.py @@ -21,25 +21,25 @@ from numba.core import config -if sys.platform == 'win32': - _dllnamepattern = '%s.dll' - _staticnamepattern = '%s.lib' -elif sys.platform == 'darwin': - _dllnamepattern = 'lib%s.dylib' - _staticnamepattern = 'lib%s.a' +if sys.platform == "win32": + _dllnamepattern = "%s.dll" + _staticnamepattern = "%s.lib" +elif sys.platform == "darwin": + _dllnamepattern = "lib%s.dylib" + _staticnamepattern = "lib%s.a" else: - _dllnamepattern = 'lib%s.so' - _staticnamepattern = 'lib%s.a' + _dllnamepattern = "lib%s.so" + _staticnamepattern = "lib%s.a" def get_libdevice(): d = get_cuda_paths() - paths = d['libdevice'].info + paths = d["libdevice"].info return paths def open_libdevice(): - with open(get_libdevice(), 'rb') as bcfile: + with open(get_libdevice(), "rb") as bcfile: return bcfile.read() @@ -50,10 +50,10 @@ def get_cudalib(lib, static=False): 'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system loader's search mechanism. 
""" - if lib == 'nvvm': - return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm' + if lib == "nvvm": + return get_cuda_paths()["nvvm"].info or _dllnamepattern % "nvvm" else: - dir_type = 'static_cudalib_dir' if static else 'cudalib_dir' + dir_type = "static_cudalib_dir" if static else "cudalib_dir" libdir = get_cuda_paths()[dir_type].info candidates = find_lib(lib, libdir, static=static) @@ -68,7 +68,7 @@ def get_cuda_include_dir(): configuration. """ - return get_cuda_paths()['include_dir'].info + return get_cuda_paths()["include_dir"].info def check_cuda_include_dir(path): @@ -86,39 +86,38 @@ def open_cudalib(lib): def check_static_lib(path): if not os.path.isfile(path): - raise FileNotFoundError(f'{path} not found') + raise FileNotFoundError(f"{path} not found") def _get_source_variable(lib, static=False): - if lib == 'nvvm': - return get_cuda_paths()['nvvm'].by - elif lib == 'libdevice': - return get_cuda_paths()['libdevice'].by - elif lib == 'include_dir': - return get_cuda_paths()['include_dir'].by + if lib == "nvvm": + return get_cuda_paths()["nvvm"].by + elif lib == "libdevice": + return get_cuda_paths()["libdevice"].by + elif lib == "include_dir": + return get_cuda_paths()["include_dir"].by else: - dir_type = 'static_cudalib_dir' if static else 'cudalib_dir' + dir_type = "static_cudalib_dir" if static else "cudalib_dir" return get_cuda_paths()[dir_type].by def test(): - """Test library lookup. Path info is printed to stdout. - """ + """Test library lookup. 
Path info is printed to stdout.""" failed = False # Check for the driver try: dlloader, candidates = locate_driver_and_loader() - print('Finding driver from candidates:') + print("Finding driver from candidates:") for location in candidates: - print(f'\t{location}') - print(f'Using loader {dlloader}') - print('\tTrying to load driver', end='...') + print(f"\t{location}") + print(f"Using loader {dlloader}") + print("\tTrying to load driver", end="...") dll, path = load_driver(dlloader, candidates) - print('\tok') - print(f'\t\tLoaded from {path}') + print("\tok") + print(f"\t\tLoaded from {path}") except CudaSupportError as e: - print(f'\tERROR: failed to open driver: {e}') + print(f"\tERROR: failed to open driver: {e}") failed = True # Find the absolute location of the driver on Linux. Various driver-related @@ -127,9 +126,9 @@ def test(): # Providing the absolute location of the driver indicates its version # number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to # look up whether the driver was intended for "native" Linux. - if sys.platform == 'linux' and not failed: + if sys.platform == "linux" and not failed: pid = os.getpid() - mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps') + mapsfile = os.path.join(os.path.sep, "proc", f"{pid}", "maps") try: with open(mapsfile) as f: maps = f.read() @@ -140,58 +139,61 @@ def test(): # It's helpful to report that this went wrong to the user, but we # don't set failed to True because this doesn't have any connection # to actual CUDA functionality. 
- print(f'\tERROR: Could not open {mapsfile} to determine absolute ' - 'path to libcuda.so') + print( + f"\tERROR: Could not open {mapsfile} to determine absolute " + "path to libcuda.so" + ) else: # In this case we could read the maps, so we can report the # relevant ones to the user - locations = set(s for s in maps.split() if 'libcuda.so' in s) - print('\tMapped libcuda.so paths:') + locations = set(s for s in maps.split() if "libcuda.so" in s) + print("\tMapped libcuda.so paths:") for location in locations: - print(f'\t\t{location}') + print(f"\t\t{location}") # Checks for dynamic libraries - libs = 'nvvm nvrtc cudart'.split() + libs = "nvvm nvrtc cudart".split() for lib in libs: path = get_cudalib(lib) - print('Finding {} from {}'.format(lib, _get_source_variable(lib))) - print('\tLocated at', path) + print("Finding {} from {}".format(lib, _get_source_variable(lib))) + print("\tLocated at", path) try: - print('\tTrying to open library', end='...') + print("\tTrying to open library", end="...") open_cudalib(lib) - print('\tok') + print("\tok") except OSError as e: - print('\tERROR: failed to open %s:\n%s' % (lib, e)) + print("\tERROR: failed to open %s:\n%s" % (lib, e)) failed = True # Check for cudadevrt (the only static library) - lib = 'cudadevrt' + lib = "cudadevrt" path = get_cudalib(lib, static=True) - print('Finding {} from {}'.format(lib, _get_source_variable(lib, - static=True))) - print('\tLocated at', path) + print( + "Finding {} from {}".format(lib, _get_source_variable(lib, static=True)) + ) + print("\tLocated at", path) try: - print('\tChecking library', end='...') + print("\tChecking library", end="...") check_static_lib(path) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find %s:\n%s' % (lib, e)) + print("\tERROR: failed to find %s:\n%s" % (lib, e)) failed = True # Check for libdevice - where = _get_source_variable('libdevice') - print(f'Finding libdevice from {where}') + where = 
_get_source_variable("libdevice") + print(f"Finding libdevice from {where}") path = get_libdevice() - print('\tLocated at', path) + print("\tLocated at", path) try: - print('\tChecking library', end='...') + print("\tChecking library", end="...") check_static_lib(path) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find %s:\n%s' % (lib, e)) + print("\tERROR: failed to find %s:\n%s" % (lib, e)) failed = True # Check cuda include paths @@ -199,16 +201,16 @@ def test(): print("Include directory configuration variable:") print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}") - where = _get_source_variable('include_dir') - print(f'Finding include directory from {where}') + where = _get_source_variable("include_dir") + print(f"Finding include directory from {where}") include = get_cuda_include_dir() - print('\tLocated at', include) + print("\tLocated at", include) try: - print('\tChecking include directory', end='...') + print("\tChecking include directory", end="...") check_cuda_include_dir(include) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find cuda include directory:\n%s' % e) + print("\tERROR: failed to find cuda include directory:\n%s" % e) failed = True return not failed diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 95d369efd..aa94d22e9 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -1,24 +1,26 @@ from numba import config from . 
import enums + if config.CUDA_USE_NVIDIA_BINDING: from cuda import cuda + jitty = cuda.CUjitInputType FILE_EXTENSION_MAP = { - 'o': jitty.CU_JIT_INPUT_OBJECT, - 'ptx': jitty.CU_JIT_INPUT_PTX, - 'a': jitty.CU_JIT_INPUT_LIBRARY, - 'lib': jitty.CU_JIT_INPUT_LIBRARY, - 'cubin': jitty.CU_JIT_INPUT_CUBIN, - 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, - 'ltoir': jitty.CU_JIT_INPUT_NVVM, + "o": jitty.CU_JIT_INPUT_OBJECT, + "ptx": jitty.CU_JIT_INPUT_PTX, + "a": jitty.CU_JIT_INPUT_LIBRARY, + "lib": jitty.CU_JIT_INPUT_LIBRARY, + "cubin": jitty.CU_JIT_INPUT_CUBIN, + "fatbin": jitty.CU_JIT_INPUT_FATBINARY, + "ltoir": jitty.CU_JIT_INPUT_NVVM, } else: FILE_EXTENSION_MAP = { - 'o': enums.CU_JIT_INPUT_OBJECT, - 'ptx': enums.CU_JIT_INPUT_PTX, - 'a': enums.CU_JIT_INPUT_LIBRARY, - 'lib': enums.CU_JIT_INPUT_LIBRARY, - 'cubin': enums.CU_JIT_INPUT_CUBIN, - 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - 'ltoir': enums.CU_JIT_INPUT_NVVM, + "o": enums.CU_JIT_INPUT_OBJECT, + "ptx": enums.CU_JIT_INPUT_PTX, + "a": enums.CU_JIT_INPUT_LIBRARY, + "lib": enums.CU_JIT_INPUT_LIBRARY, + "cubin": enums.CU_JIT_INPUT_CUBIN, + "fatbin": enums.CU_JIT_INPUT_FATBINARY, + "ltoir": enums.CU_JIT_INPUT_NVVM, } diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 5ab970c02..145873848 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -1,7 +1,10 @@ from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER from enum import IntEnum -from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError, - NvrtcSupportError) +from numba.cuda.cudadrv.error import ( + NvrtcError, + NvrtcCompilationError, + NvrtcSupportError, +) from numba.cuda.cuda_paths import get_cuda_paths import functools import os @@ -39,6 +42,7 @@ class NvrtcProgram: the class own an nvrtcProgram; when an instance is deleted, the underlying nvrtcProgram is destroyed using the appropriate NVRTC API. 
""" + def __init__(self, nvrtc, handle): self._nvrtc = nvrtc self._handle = handle @@ -66,42 +70,56 @@ class NVRTC: # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet); "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto); - "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p) + "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p), } _PROTOTYPES = { # nvrtcResult nvrtcVersion(int *major, int *minor) - 'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)), + "nvrtcVersion": (nvrtc_result, POINTER(c_int), POINTER(c_int)), # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, # const char *src, # const char *name, # int numHeaders, # const char * const *headers, # const char * const *includeNames) - 'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p, - c_int, POINTER(c_char_p), POINTER(c_char_p)), + "nvrtcCreateProgram": ( + nvrtc_result, + nvrtc_program, + c_char_p, + c_char_p, + c_int, + POINTER(c_char_p), + POINTER(c_char_p), + ), # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); - 'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)), + "nvrtcDestroyProgram": (nvrtc_result, POINTER(nvrtc_program)), # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, # int numOptions, # const char * const *options) - 'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int, - POINTER(c_char_p)), + "nvrtcCompileProgram": ( + nvrtc_result, + nvrtc_program, + c_int, + POINTER(c_char_p), + ), # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - 'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)), + "nvrtcGetPTXSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - 'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetPTX": (nvrtc_result, nvrtc_program, c_char_p), # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, # size_t 
*cubinSizeRet); - 'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)), + "nvrtcGetCUBINSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin); - 'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetCUBIN": (nvrtc_result, nvrtc_program, c_char_p), # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, # size_t *logSizeRet); - 'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program, - POINTER(c_size_t)), + "nvrtcGetProgramLogSize": ( + nvrtc_result, + nvrtc_program, + POINTER(c_size_t), + ), # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - 'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetProgramLog": (nvrtc_result, nvrtc_program, c_char_p), } # Singleton reference @@ -111,14 +129,16 @@ def __new__(cls): with _nvrtc_lock: if cls.__INSTANCE is None: from numba.cuda.cudadrv.libs import open_cudalib + cls.__INSTANCE = inst = object.__new__(cls) try: - lib = open_cudalib('nvrtc') + lib = open_cudalib("nvrtc") except OSError as e: cls.__INSTANCE = None raise NvrtcSupportError("NVRTC cannot be loaded") from e from numba.cuda.cudadrv.runtime import get_version + if get_version() >= (12, 0): inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES @@ -137,9 +157,11 @@ def checked_call(*args, func=func, name=name): try: error_name = NvrtcResult(error).name except ValueError: - error_name = ('Unknown nvrtc_result ' - f'(error code: {error})') - msg = f'Failed to call {name}: {error_name}' + error_name = ( + "Unknown nvrtc_result " + f"(error code: {error})" + ) + msg = f"Failed to call {name}: {error_name}" raise NvrtcError(msg) setattr(inst, name, checked_call) @@ -182,7 +204,7 @@ def compile_program(self, program, options): # prior to the call to nvrtcCompileProgram encoded_options = [opt.encode() for opt in options] option_pointers = [c_char_p(opt) for opt in encoded_options] - c_options_type = (c_char_p * len(options)) + c_options_type = c_char_p * 
len(options) c_options = c_options_type(*option_pointers) try: self.nvrtcCompileProgram(program.handle, len(options), c_options) @@ -257,7 +279,7 @@ def compile(src, name, cc, ltoir=False): # - Relocatable Device Code (rdc) is needed to prevent device functions # being optimized away. major, minor = cc - arch = f'--gpu-architecture=compute_{major}{minor}' + arch = f"--gpu-architecture=compute_{major}{minor}" cuda_include = [ f"-I{get_cuda_paths()['include_dir'].info}", @@ -265,12 +287,12 @@ def compile(src, name, cc, ltoir=False): cudadrv_path = os.path.dirname(os.path.abspath(__file__)) numba_cuda_path = os.path.dirname(cudadrv_path) - numba_include = f'-I{numba_cuda_path}' + numba_include = f"-I{numba_cuda_path}" nrt_path = os.path.join(numba_cuda_path, "runtime") - nrt_include = f'-I{nrt_path}' + nrt_include = f"-I{nrt_path}" - options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true'] + options = [arch, *cuda_include, numba_include, nrt_include, "-rdc", "true"] if ltoir: options.append("-dlto") @@ -286,12 +308,12 @@ def compile(src, name, cc, ltoir=False): # If the compile failed, provide the log in an exception if compile_error: - msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}') + msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}" raise NvrtcError(msg) # Otherwise, if there's any content in the log, present it as a warning if log: - msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}") + msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" warnings.warn(msg) if ltoir: diff --git a/numba_cuda/numba/cuda/cudadrv/nvvm.py b/numba_cuda/numba/cuda/cudadrv/nvvm.py index 0844661e2..b46fb0a39 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvvm.py +++ b/numba_cuda/numba/cuda/cudadrv/nvvm.py @@ -1,12 +1,12 @@ """ This is a direct translation of nvvm.h """ + import logging import re import sys import warnings -from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref, - c_char) +from 
ctypes import c_void_p, c_int, POINTER, c_char_p, c_size_t, byref, c_char import threading @@ -31,7 +31,7 @@ # Result code nvvm_result = c_int -RESULT_CODE_NAMES = ''' +RESULT_CODE_NAMES = """ NVVM_SUCCESS NVVM_ERROR_OUT_OF_MEMORY NVVM_ERROR_PROGRAM_CREATION_FAILURE @@ -42,19 +42,23 @@ NVVM_ERROR_INVALID_OPTION NVVM_ERROR_NO_MODULE_IN_PROGRAM NVVM_ERROR_COMPILATION -'''.split() +""".split() for i, k in enumerate(RESULT_CODE_NAMES): setattr(sys.modules[__name__], k, i) # Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support. -_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-' - 'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' - 'v64:64:64-v128:128:128-n16:32:64') -_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-' - 'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' - 'v64:64:64-v128:128:128-n16:32:64') +_datalayout_original = ( + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" + "i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-" + "v64:64:64-v128:128:128-n16:32:64" +) +_datalayout_i128 = ( + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-" + "v64:64:64-v128:128:128-n16:32:64" +) def is_available(): @@ -73,59 +77,74 @@ def is_available(): class NVVM(object): - '''Process-wide singleton. 
- ''' - _PROTOTYPES = { + """Process-wide singleton.""" + _PROTOTYPES = { # nvvmResult nvvmVersion(int *major, int *minor) - 'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)), - + "nvvmVersion": (nvvm_result, POINTER(c_int), POINTER(c_int)), # nvvmResult nvvmCreateProgram(nvvmProgram *cu) - 'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)), - + "nvvmCreateProgram": (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmDestroyProgram(nvvmProgram *cu) - 'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)), - + "nvvmDestroyProgram": (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer, # size_t size, const char *name) - 'nvvmAddModuleToProgram': ( - nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), - + "nvvmAddModuleToProgram": ( + nvvm_result, + nvvm_program, + c_char_p, + c_size_t, + c_char_p, + ), # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu, # const char* buffer, # size_t size, # const char *name) - 'nvvmLazyAddModuleToProgram': ( - nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), - + "nvvmLazyAddModuleToProgram": ( + nvvm_result, + nvvm_program, + c_char_p, + c_size_t, + c_char_p, + ), # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions, # const char **options) - 'nvvmCompileProgram': ( - nvvm_result, nvvm_program, c_int, POINTER(c_char_p)), - + "nvvmCompileProgram": ( + nvvm_result, + nvvm_program, + c_int, + POINTER(c_char_p), + ), # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu, # size_t *bufferSizeRet) - 'nvvmGetCompiledResultSize': ( - nvvm_result, nvvm_program, POINTER(c_size_t)), - + "nvvmGetCompiledResultSize": ( + nvvm_result, + nvvm_program, + POINTER(c_size_t), + ), # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer) - 'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p), - + "nvvmGetCompiledResult": (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu, # size_t 
*bufferSizeRet) - 'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)), - + "nvvmGetProgramLogSize": (nvvm_result, nvvm_program, POINTER(c_size_t)), # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer) - 'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p), - + "nvvmGetProgramLog": (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg, # int* minorDbg ) - 'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int), - POINTER(c_int), POINTER(c_int)), + "nvvmIRVersion": ( + nvvm_result, + POINTER(c_int), + POINTER(c_int), + POINTER(c_int), + POINTER(c_int), + ), # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions, # const char** options) - 'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int, - POINTER(c_char_p)) + "nvvmVerifyProgram": ( + nvvm_result, + nvvm_program, + c_int, + POINTER(c_char_p), + ), } # Singleton reference @@ -136,11 +155,13 @@ def __new__(cls): if cls.__INSTANCE is None: cls.__INSTANCE = inst = object.__new__(cls) try: - inst.driver = open_cudalib('nvvm') + inst.driver = open_cudalib("nvvm") except OSError as e: cls.__INSTANCE = None - errmsg = ("libNVVM cannot be found. Do `conda install " - "cudatoolkit`:\n%s") + errmsg = ( + "libNVVM cannot be found. 
Do `conda install " + "cudatoolkit`:\n%s" + ) raise NvvmSupportError(errmsg % e) # Find & populate functions @@ -175,7 +196,7 @@ def get_version(self): major = c_int() minor = c_int() err = self.nvvmVersion(byref(major), byref(minor)) - self.check_error(err, 'Failed to get version.') + self.check_error(err, "Failed to get version.") return major.value, minor.value def get_ir_version(self): @@ -183,9 +204,10 @@ def get_ir_version(self): minorIR = c_int() majorDbg = c_int() minorDbg = c_int() - err = self.nvvmIRVersion(byref(majorIR), byref(minorIR), - byref(majorDbg), byref(minorDbg)) - self.check_error(err, 'Failed to get IR version.') + err = self.nvvmIRVersion( + byref(majorIR), byref(minorIR), byref(majorDbg), byref(minorDbg) + ) + self.check_error(err, "Failed to get IR version.") return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value def check_error(self, error, msg, exit=False): @@ -223,18 +245,18 @@ def __init__(self, options): self.driver = NVVM() self._handle = nvvm_program() err = self.driver.nvvmCreateProgram(byref(self._handle)) - self.driver.check_error(err, 'Failed to create CU') + self.driver.check_error(err, "Failed to create CU") def stringify_option(k, v): - k = k.replace('_', '-') + k = k.replace("_", "-") if v is None: - return f'-{k}'.encode('utf-8') + return f"-{k}".encode("utf-8") if isinstance(v, bool): v = int(v) - return f'-{k}={v}'.encode('utf-8') + return f"-{k}={v}".encode("utf-8") options = [stringify_option(k, v) for k, v in options.items()] option_ptrs = (c_char_p * len(options))(*[c_char_p(x) for x in options]) @@ -248,17 +270,18 @@ def stringify_option(k, v): def __del__(self): driver = NVVM() err = driver.nvvmDestroyProgram(byref(self._handle)) - driver.check_error(err, 'Failed to destroy CU', exit=True) + driver.check_error(err, "Failed to destroy CU", exit=True) def add_module(self, buffer): """ - Add a module level NVVM IR to a compilation unit. 
- - The buffer should contain an NVVM module IR either in the bitcode - representation (LLVM3.0) or in the text representation. + Add a module level NVVM IR to a compilation unit. + - The buffer should contain an NVVM module IR either in the bitcode + representation (LLVM3.0) or in the text representation. """ - err = self.driver.nvvmAddModuleToProgram(self._handle, buffer, - len(buffer), None) - self.driver.check_error(err, 'Failed to add module') + err = self.driver.nvvmAddModuleToProgram( + self._handle, buffer, len(buffer), None + ) + self.driver.check_error(err, "Failed to add module") def lazy_add_module(self, buffer): """ @@ -266,37 +289,41 @@ def lazy_add_module(self, buffer): The buffer should contain NVVM module IR either in the bitcode representation or in the text representation. """ - err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer, - len(buffer), None) - self.driver.check_error(err, 'Failed to add module') + err = self.driver.nvvmLazyAddModuleToProgram( + self._handle, buffer, len(buffer), None + ) + self.driver.check_error(err, "Failed to add module") def verify(self): """ Run the NVVM verifier on all code added to the compilation unit. """ - err = self.driver.nvvmVerifyProgram(self._handle, self.n_options, - self.option_ptrs) - self._try_error(err, 'Failed to verify\n') + err = self.driver.nvvmVerifyProgram( + self._handle, self.n_options, self.option_ptrs + ) + self._try_error(err, "Failed to verify\n") def compile(self): """ Compile all modules added to the compilation unit and return the resulting PTX or LTO-IR (depending on the options). 
""" - err = self.driver.nvvmCompileProgram(self._handle, self.n_options, - self.option_ptrs) - self._try_error(err, 'Failed to compile\n') + err = self.driver.nvvmCompileProgram( + self._handle, self.n_options, self.option_ptrs + ) + self._try_error(err, "Failed to compile\n") # Get result result_size = c_size_t() - err = self.driver.nvvmGetCompiledResultSize(self._handle, - byref(result_size)) + err = self.driver.nvvmGetCompiledResultSize( + self._handle, byref(result_size) + ) - self._try_error(err, 'Failed to get size of compiled result.') + self._try_error(err, "Failed to get size of compiled result.") output_buffer = (c_char * result_size.value)() err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer) - self._try_error(err, 'Failed to get compiled result.') + self._try_error(err, "Failed to get compiled result.") # Get log self.log = self.get_log() @@ -311,26 +338,37 @@ def _try_error(self, err, msg): def get_log(self): reslen = c_size_t() err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen)) - self.driver.check_error(err, 'Failed to get compilation log size.') + self.driver.check_error(err, "Failed to get compilation log size.") if reslen.value > 1: logbuf = (c_char * reslen.value)() err = self.driver.nvvmGetProgramLog(self._handle, logbuf) - self.driver.check_error(err, 'Failed to get compilation log.') + self.driver.check_error(err, "Failed to get compilation log.") - return logbuf.value.decode('utf8') # populate log attribute + return logbuf.value.decode("utf8") # populate log attribute - return '' + return "" COMPUTE_CAPABILITIES = ( - (3, 5), (3, 7), - (5, 0), (5, 2), (5, 3), - (6, 0), (6, 1), (6, 2), - (7, 0), (7, 2), (7, 5), - (8, 0), (8, 6), (8, 7), (8, 9), + (3, 5), + (3, 7), + (5, 0), + (5, 2), + (5, 3), + (6, 0), + (6, 1), + (6, 2), + (7, 0), + (7, 2), + (7, 5), + (8, 0), + (8, 6), + (8, 7), + (8, 9), (9, 0), - (10, 0), (10, 1), + (10, 0), + (10, 1), (12, 0), ) @@ -358,20 +396,27 @@ def 
ccs_supported_by_ctk(ctk_version): try: # For supported versions, we look up the range of supported CCs min_cc, max_cc = CTK_SUPPORTED[ctk_version] - return tuple([cc for cc in COMPUTE_CAPABILITIES - if min_cc <= cc <= max_cc]) + return tuple( + [cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc] + ) except KeyError: # For unsupported CUDA toolkit versions, all we can do is assume all # non-deprecated versions we are aware of are supported. - return tuple([cc for cc in COMPUTE_CAPABILITIES - if cc >= config.CUDA_DEFAULT_PTX_CC]) + return tuple( + [ + cc + for cc in COMPUTE_CAPABILITIES + if cc >= config.CUDA_DEFAULT_PTX_CC + ] + ) def get_supported_ccs(): try: from numba.cuda.cudadrv.runtime import runtime + cudart_version = runtime.get_version() - except: # noqa: E722 + except: # noqa: E722 # We can't support anything if there's an error getting the runtime # version (e.g. if it's not present or there's another issue) _supported_cc = () @@ -382,9 +427,11 @@ def get_supported_ccs(): if cudart_version < min_cudart: _supported_cc = () ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}" - unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - " - f"{min_cudart[0]}.{min_cudart[1]} is the minimum " - "required version.") + unsupported_ver = ( + f"CUDA Toolkit {ctk_ver} is unsupported by Numba - " + f"{min_cudart[0]}.{min_cudart[1]} is the minimum " + "required version." + ) warnings.warn(unsupported_ver) return _supported_cc @@ -403,8 +450,10 @@ def find_closest_arch(mycc): supported_ccs = NVVM().supported_ccs if not supported_ccs: - msg = "No supported GPU compute capabilities found. " \ - "Please check your cudatoolkit version matches your CUDA version." + msg = ( + "No supported GPU compute capabilities found. " + "Please check your cudatoolkit version matches your CUDA version." 
+ ) raise NvvmSupportError(msg) for i, cc in enumerate(supported_ccs): @@ -415,8 +464,10 @@ def find_closest_arch(mycc): # Exceeded if i == 0: # CC lower than supported - msg = "GPU compute capability %d.%d is not supported" \ - "(requires >=%d.%d)" % (mycc + cc) + msg = ( + "GPU compute capability %d.%d is not supported" + "(requires >=%d.%d)" % (mycc + cc) + ) raise NvvmSupportError(msg) else: # return the previous CC @@ -427,16 +478,15 @@ def find_closest_arch(mycc): def get_arch_option(major, minor): - """Matches with the closest architecture option - """ + """Matches with the closest architecture option""" if config.FORCE_CUDA_CC: arch = config.FORCE_CUDA_CC else: arch = find_closest_arch((major, minor)) - return 'compute_%d%d' % arch + return "compute_%d%d" % arch -MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file. +MISSING_LIBDEVICE_FILE_MSG = """Missing libdevice file. Please ensure you have a CUDA Toolkit 11.2 or higher. For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required: @@ -445,7 +495,7 @@ def get_arch_option(major, minor): For CUDA 11, ``cudatoolkit`` is required: $ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0" -''' +""" class LibDevice(object): @@ -466,7 +516,7 @@ def get(self): cas_nvvm = """ %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0 -""" # noqa: E501 +""" # noqa: E501 # Translation of code from CUDA Programming Guide v6.5, section B.12 @@ -490,7 +540,7 @@ def get(self): %result = bitcast {Ti} %old to {T} ret {T} %result }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_inc_template = """ define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{ @@ -510,7 +560,7 @@ def get(self): done: ret {T} %old }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_dec_template = """ define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{ @@ -530,7 +580,7 @@ def get(self): done: 
ret {T} %old }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_minmax_template = """ define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{ @@ -561,7 +611,7 @@ def get(self): done: ret {T} %ptrval }} -""" # noqa: E501 +""" # noqa: E501 def ir_cas(Ti): @@ -574,8 +624,15 @@ def ir_numba_atomic_binary(T, Ti, OP, FUNC): def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC): - params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, - FUNC=FUNC, CAS=ir_cas(Ti)) + params = dict( + T=T, + Ti=Ti, + NAN=NAN, + OP=OP, + PTR_OR_VAL=PTR_OR_VAL, + FUNC=FUNC, + CAS=ir_cas(Ti), + ) return ir_numba_atomic_minmax_template.format(**params) @@ -590,41 +647,115 @@ def ir_numba_atomic_dec(T, Tu): def llvm_replace(llvmir): replacements = [ - ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')), - ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')), - ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')), - ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")', - ir_numba_atomic_inc(T='i64', Tu='u64')), - ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")', - ir_numba_atomic_dec(T='i64', Tu='u64')), - ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt', - PTR_OR_VAL='ptr', FUNC='max')), - ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt', - PTR_OR_VAL='ptr', FUNC='max')), - ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501 - 
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt', - PTR_OR_VAL='ptr', FUNC='min')), - ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt', - PTR_OR_VAL='ptr', FUNC='min')), - ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult', - PTR_OR_VAL='', FUNC='max')), - ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult', - PTR_OR_VAL='', FUNC='max')), - ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt', - PTR_OR_VAL='', FUNC='min')), - ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt', - PTR_OR_VAL='', FUNC='min')), - ('immarg', '') + ( + 'declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_binary(T="double", Ti="i64", OP="fadd", FUNC="add"), + ), + ( + 'declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_binary(T="float", Ti="i32", OP="fsub", FUNC="sub"), + ), + ( + 'declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_binary(T="double", Ti="i64", OP="fsub", FUNC="sub"), + ), + ( + 'declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")', + ir_numba_atomic_inc(T="i64", Tu="u64"), + ), + ( + 'declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")', + ir_numba_atomic_dec(T="i64", Tu="u64"), + ), + ( + 'declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="", + OP="nnan 
olt", + PTR_OR_VAL="ptr", + FUNC="max", + ), + ), + ( + 'declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="", + OP="nnan olt", + PTR_OR_VAL="ptr", + FUNC="max", + ), + ), + ( + 'declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="", + OP="nnan ogt", + PTR_OR_VAL="ptr", + FUNC="min", + ), + ), + ( + 'declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="", + OP="nnan ogt", + PTR_OR_VAL="ptr", + FUNC="min", + ), + ), + ( + 'declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="nan", + OP="ult", + PTR_OR_VAL="", + FUNC="max", + ), + ), + ( + 'declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="nan", + OP="ult", + PTR_OR_VAL="", + FUNC="max", + ), + ), + ( + 'declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="nan", + OP="ugt", + PTR_OR_VAL="", + FUNC="min", + ), + ), + ( + 'declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="nan", + OP="ugt", + PTR_OR_VAL="", + FUNC="min", + ), + ), + ("immarg", ""), ] for decl, fn in replacements: @@ -639,19 +770,21 @@ def compile_ir(llvmir, **options): if isinstance(llvmir, str): llvmir = [llvmir] - if options.pop('fastmath', False): - options.update({ - 'ftz': True, - 'fma': True, - 'prec_div': False, - 'prec_sqrt': False, - }) + if options.pop("fastmath", False): + options.update( + { + "ftz": True, + "fma": True, + "prec_div": False, + "prec_sqrt": False, + } + ) cu = 
CompilationUnit(options) for mod in llvmir: mod = llvm_replace(mod) - cu.add_module(mod.encode('utf8')) + cu.add_module(mod.encode("utf8")) cu.verify() # We add libdevice following verification so that it is not subject to the @@ -671,16 +804,16 @@ def llvm150_to_70_ir(ir): """ buf = [] for line in ir.splitlines(): - if line.startswith('attributes #'): + if line.startswith("attributes #"): # Remove function attributes unsupported by LLVM 7.0 m = re_attributes_def.match(line) attrs = m.group(1).split() - attrs = ' '.join(a for a in attrs if a != 'willreturn') + attrs = " ".join(a for a in attrs if a != "willreturn") line = line.replace(m.group(1), attrs) buf.append(line) - return '\n'.join(buf) + return "\n".join(buf) def set_cuda_kernel(function): @@ -704,7 +837,7 @@ def set_cuda_kernel(function): mdvalue = ir.Constant(ir.IntType(32), 1) md = module.add_metadata((function, mdstr, mdvalue)) - nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations') + nmd = cgutils.get_or_insert_named_metadata(module, "nvvm.annotations") nmd.add(md) # Create the used list @@ -713,13 +846,13 @@ def set_cuda_kernel(function): fnptr = function.bitcast(ptrty) - llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used') - llvm_used.linkage = 'appending' - llvm_used.section = 'llvm.metadata' + llvm_used = ir.GlobalVariable(module, usedty, "llvm.used") + llvm_used.linkage = "appending" + llvm_used.section = "llvm.metadata" llvm_used.initializer = ir.Constant(usedty, [fnptr]) # Remove 'noinline' if it is present. 
- function.attributes.discard('noinline') + function.attributes.discard("noinline") def add_ir_version(mod): @@ -728,4 +861,4 @@ def add_ir_version(mod): i32 = ir.IntType(32) ir_versions = [i32(v) for v in NVVM().get_ir_version()] md_ver = mod.add_metadata(ir_versions) - mod.add_named_metadata('nvvmir.version', md_ver) + mod.add_named_metadata("nvvmir.version", md_ver) diff --git a/numba_cuda/numba/cuda/cudadrv/rtapi.py b/numba_cuda/numba/cuda/cudadrv/rtapi.py index 4a88457f9..4d30f5c63 100644 --- a/numba_cuda/numba/cuda/cudadrv/rtapi.py +++ b/numba_cuda/numba/cuda/cudadrv/rtapi.py @@ -6,5 +6,5 @@ API_PROTOTYPES = { # cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion ) - 'cudaRuntimeGetVersion': (c_int, POINTER(c_int)), + "cudaRuntimeGetVersion": (c_int, POINTER(c_int)), } diff --git a/numba_cuda/numba/cuda/cudadrv/runtime.py b/numba_cuda/numba/cuda/cudadrv/runtime.py index 20634d8f4..d665f4db1 100644 --- a/numba_cuda/numba/cuda/cudadrv/runtime.py +++ b/numba_cuda/numba/cuda/cudadrv/runtime.py @@ -21,6 +21,7 @@ class CudaRuntimeAPIError(CudaRuntimeError): """ Raised when there is an error accessing a C API from the CUDA Runtime. """ + def __init__(self, code, msg): self.code = code self.msg = msg @@ -44,11 +45,13 @@ def _initialize(self): _logger = make_logger() if config.DISABLE_CUDA: - msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " - "in the environment, or because CUDA is unsupported on " - "32-bit systems.") + msg = ( + "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " + "in the environment, or because CUDA is unsupported on " + "32-bit systems." 
+ ) raise CudaSupportError(msg) - self.lib = open_cudalib('cudart') + self.lib = open_cudalib("cudart") self.is_initialized = True @@ -76,9 +79,10 @@ def __getattr__(self, fname): def _wrap_api_call(self, fname, libfn): @functools.wraps(libfn) def safe_cuda_api_call(*args): - _logger.debug('call runtime api: %s', libfn.__name__) + _logger.debug("call runtime api: %s", libfn.__name__) retcode = libfn(*args) self._check_error(fname, retcode) + return safe_cuda_api_call def _check_error(self, fname, retcode): @@ -125,11 +129,19 @@ def is_supported_version(self): def supported_versions(self): """A tuple of all supported CUDA toolkit versions. Versions are given in the form ``(major_version, minor_version)``.""" - if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64: + if sys.platform not in ("linux", "win32") or config.MACHINE_BITS != 64: # Only 64-bit Linux and Windows are supported return () - return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), - (11, 7)) + return ( + (11, 0), + (11, 1), + (11, 2), + (11, 3), + (11, 4), + (11, 5), + (11, 6), + (11, 7), + ) runtime = Runtime() diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py index 0ec08298c..931c43e31 100644 --- a/numba_cuda/numba/cuda/cudaimpl.py +++ b/numba_cuda/numba/cuda/cudaimpl.py @@ -29,48 +29,49 @@ def initialize_dim3(builder, prefix): return cgutils.pack_struct(builder, (x, y, z)) -@lower_attr(types.Module(cuda), 'threadIdx') +@lower_attr(types.Module(cuda), "threadIdx") def cuda_threadIdx(context, builder, sig, args): - return initialize_dim3(builder, 'tid') + return initialize_dim3(builder, "tid") -@lower_attr(types.Module(cuda), 'blockDim') +@lower_attr(types.Module(cuda), "blockDim") def cuda_blockDim(context, builder, sig, args): - return initialize_dim3(builder, 'ntid') + return initialize_dim3(builder, "ntid") -@lower_attr(types.Module(cuda), 'blockIdx') +@lower_attr(types.Module(cuda), "blockIdx") def cuda_blockIdx(context, 
builder, sig, args): - return initialize_dim3(builder, 'ctaid') + return initialize_dim3(builder, "ctaid") -@lower_attr(types.Module(cuda), 'gridDim') +@lower_attr(types.Module(cuda), "gridDim") def cuda_gridDim(context, builder, sig, args): - return initialize_dim3(builder, 'nctaid') + return initialize_dim3(builder, "nctaid") -@lower_attr(types.Module(cuda), 'laneid') +@lower_attr(types.Module(cuda), "laneid") def cuda_laneid(context, builder, sig, args): - return nvvmutils.call_sreg(builder, 'laneid') + return nvvmutils.call_sreg(builder, "laneid") -@lower_attr(dim3, 'x') +@lower_attr(dim3, "x") def dim3_x(context, builder, sig, args): return builder.extract_value(args, 0) -@lower_attr(dim3, 'y') +@lower_attr(dim3, "y") def dim3_y(context, builder, sig, args): return builder.extract_value(args, 1) -@lower_attr(dim3, 'z') +@lower_attr(dim3, "z") def dim3_z(context, builder, sig, args): return builder.extract_value(args, 2) # ----------------------------------------------------------------------------- + @lower(cuda.const.array_like, types.Array) def cuda_const_array_like(context, builder, sig, args): # This is a no-op because CUDATargetContext.make_constant_array already @@ -95,48 +96,68 @@ def _get_unique_smem_id(name): def cuda_shared_array_integer(context, builder, sig, args): length = sig.args[0].literal_value dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=(length,), dtype=dtype, - symbol_name=_get_unique_smem_id('_cudapy_smem'), - addrspace=nvvm.ADDRSPACE_SHARED, - can_dynsized=True) + return _generic_array( + context, + builder, + shape=(length,), + dtype=dtype, + symbol_name=_get_unique_smem_id("_cudapy_smem"), + addrspace=nvvm.ADDRSPACE_SHARED, + can_dynsized=True, + ) @lower(cuda.shared.array, types.Tuple, types.Any) @lower(cuda.shared.array, types.UniTuple, types.Any) def cuda_shared_array_tuple(context, builder, sig, args): - shape = [ s.literal_value for s in sig.args[0] ] + shape = [s.literal_value for s in 
sig.args[0]] dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name=_get_unique_smem_id('_cudapy_smem'), - addrspace=nvvm.ADDRSPACE_SHARED, - can_dynsized=True) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name=_get_unique_smem_id("_cudapy_smem"), + addrspace=nvvm.ADDRSPACE_SHARED, + can_dynsized=True, + ) @lower(cuda.local.array, types.IntegerLiteral, types.Any) def cuda_local_array_integer(context, builder, sig, args): length = sig.args[0].literal_value dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=(length,), dtype=dtype, - symbol_name='_cudapy_lmem', - addrspace=nvvm.ADDRSPACE_LOCAL, - can_dynsized=False) + return _generic_array( + context, + builder, + shape=(length,), + dtype=dtype, + symbol_name="_cudapy_lmem", + addrspace=nvvm.ADDRSPACE_LOCAL, + can_dynsized=False, + ) @lower(cuda.local.array, types.Tuple, types.Any) @lower(cuda.local.array, types.UniTuple, types.Any) def ptx_lmem_alloc_array(context, builder, sig, args): - shape = [ s.literal_value for s in sig.args[0] ] + shape = [s.literal_value for s in sig.args[0]] dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_cudapy_lmem', - addrspace=nvvm.ADDRSPACE_LOCAL, - can_dynsized=False) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name="_cudapy_lmem", + addrspace=nvvm.ADDRSPACE_LOCAL, + can_dynsized=False, + ) @lower(stubs.threadfence_block) def ptx_threadfence_block(context, builder, sig, args): assert not args - fname = 'llvm.nvvm.membar.cta' + fname = "llvm.nvvm.membar.cta" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -147,7 +168,7 @@ def ptx_threadfence_block(context, builder, sig, args): @lower(stubs.threadfence_system) def ptx_threadfence_system(context, builder, sig, args): assert 
not args - fname = 'llvm.nvvm.membar.sys' + fname = "llvm.nvvm.membar.sys" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -158,7 +179,7 @@ def ptx_threadfence_system(context, builder, sig, args): @lower(stubs.threadfence) def ptx_threadfence_device(context, builder, sig, args): assert not args - fname = 'llvm.nvvm.membar.gl' + fname = "llvm.nvvm.membar.gl" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -175,7 +196,7 @@ def ptx_syncwarp(context, builder, sig, args): @lower(stubs.syncwarp, types.i4) def ptx_syncwarp_mask(context, builder, sig, args): - fname = 'llvm.nvvm.bar.warp.sync' + fname = "llvm.nvvm.bar.warp.sync" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), (ir.IntType(32),)) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -183,14 +204,18 @@ def ptx_syncwarp_mask(context, builder, sig, args): return context.get_dummy_value() -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, - types.i4) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4 +) def ptx_shfl_sync_i32(context, builder, sig, args): """ The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic @@ -203,12 +228,17 @@ def ptx_shfl_sync_i32(context, builder, sig, args): value_type = sig.args[2] if value_type in types.real_domain: value 
= builder.bitcast(value, ir.IntType(value_type.bitwidth)) - fname = 'llvm.nvvm.shfl.sync.i32' + fname = "llvm.nvvm.shfl.sync.i32" lmod = builder.module fnty = ir.FunctionType( ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), - (ir.IntType(32), ir.IntType(32), ir.IntType(32), - ir.IntType(32), ir.IntType(32)) + ( + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ), ) func = cgutils.get_or_insert_function(lmod, fnty, fname) if value_type.bitwidth == 32: @@ -239,11 +269,12 @@ def ptx_shfl_sync_i32(context, builder, sig, args): @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean) def ptx_vote_sync(context, builder, sig, args): - fname = 'llvm.nvvm.vote.sync' + fname = "llvm.nvvm.vote.sync" lmod = builder.module - fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32), - ir.IntType(1))), - (ir.IntType(32), ir.IntType(32), ir.IntType(1))) + fnty = ir.FunctionType( + ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), + (ir.IntType(32), ir.IntType(32), ir.IntType(1)), + ) func = cgutils.get_or_insert_function(lmod, fnty, fname) return builder.call(func, args) @@ -257,7 +288,7 @@ def ptx_match_any_sync(context, builder, sig, args): width = sig.args[1].bitwidth if sig.args[1] in types.real_domain: value = builder.bitcast(value, ir.IntType(width)) - fname = 'llvm.nvvm.match.any.sync.i{}'.format(width) + fname = "llvm.nvvm.match.any.sync.i{}".format(width) lmod = builder.module fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32), ir.IntType(width))) func = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -273,27 +304,35 @@ def ptx_match_all_sync(context, builder, sig, args): width = sig.args[1].bitwidth if sig.args[1] in types.real_domain: value = builder.bitcast(value, ir.IntType(width)) - fname = 'llvm.nvvm.match.all.sync.i{}'.format(width) + fname = "llvm.nvvm.match.all.sync.i{}".format(width) lmod = builder.module - fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32), - ir.IntType(1))), 
- (ir.IntType(32), ir.IntType(width))) + fnty = ir.FunctionType( + ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), + (ir.IntType(32), ir.IntType(width)), + ) func = cgutils.get_or_insert_function(lmod, fnty, fname) return builder.call(func, (mask, value)) @lower(stubs.activemask) def ptx_activemask(context, builder, sig, args): - activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "activemask.b32 $0;", '=r', side_effect=True) + activemask = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "activemask.b32 $0;", + "=r", + side_effect=True, + ) return builder.call(activemask, []) @lower(stubs.lanemask_lt) def ptx_lanemask_lt(context, builder, sig, args): - activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "mov.u32 $0, %lanemask_lt;", '=r', - side_effect=True) + activemask = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "mov.u32 $0, %lanemask_lt;", + "=r", + side_effect=True, + ) return builder.call(activemask, []) @@ -308,7 +347,7 @@ def ptx_fma(context, builder, sig, args): def float16_float_ty_constraint(bitwidth): - typemap = {32: ('f32', 'f'), 64: ('f64', 'd')} + typemap = {32: ("f32", "f"), 64: ("f64", "d")} try: return typemap[bitwidth] @@ -342,7 +381,7 @@ def float_to_float16_cast(context, builder, fromty, toty, val): def float16_int_constraint(bitwidth): - typemap = { 8: 'c', 16: 'h', 32: 'r', 64: 'l' } + typemap = {8: "c", 16: "h", 32: "r", 64: "l"} try: return typemap[bitwidth] @@ -355,12 +394,12 @@ def float16_int_constraint(bitwidth): def float16_to_integer_cast(context, builder, fromty, toty, val): bitwidth = toty.bitwidth constraint = float16_int_constraint(bitwidth) - signedness = 's' if toty.signed else 'u' + signedness = "s" if toty.signed else "u" fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, - f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", - f"={constraint},h") + asm = ir.InlineAsm( + fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", 
f"={constraint},h" + ) return builder.call(asm, [val]) @@ -369,40 +408,38 @@ def float16_to_integer_cast(context, builder, fromty, toty, val): def integer_to_float16_cast(context, builder, fromty, toty, val): bitwidth = fromty.bitwidth constraint = float16_int_constraint(bitwidth) - signedness = 's' if fromty.signed else 'u' + signedness = "s" if fromty.signed else "u" - fnty = ir.FunctionType(ir.IntType(16), - [context.get_value_type(fromty)]) - asm = ir.InlineAsm(fnty, - f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", - f"=h,{constraint}") + fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)]) + asm = ir.InlineAsm( + fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}" + ) return builder.call(asm, [val]) def lower_fp16_binary(fn, op): @lower(fn, types.float16, types.float16) def ptx_fp16_binary(context, builder, sig, args): - fnty = ir.FunctionType(ir.IntType(16), - [ir.IntType(16), ir.IntType(16)]) - asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h') + fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)]) + asm = ir.InlineAsm(fnty, f"{op}.f16 $0,$1,$2;", "=h,h,h") return builder.call(asm, args) -lower_fp16_binary(stubs.fp16.hadd, 'add') -lower_fp16_binary(operator.add, 'add') -lower_fp16_binary(operator.iadd, 'add') -lower_fp16_binary(stubs.fp16.hsub, 'sub') -lower_fp16_binary(operator.sub, 'sub') -lower_fp16_binary(operator.isub, 'sub') -lower_fp16_binary(stubs.fp16.hmul, 'mul') -lower_fp16_binary(operator.mul, 'mul') -lower_fp16_binary(operator.imul, 'mul') +lower_fp16_binary(stubs.fp16.hadd, "add") +lower_fp16_binary(operator.add, "add") +lower_fp16_binary(operator.iadd, "add") +lower_fp16_binary(stubs.fp16.hsub, "sub") +lower_fp16_binary(operator.sub, "sub") +lower_fp16_binary(operator.isub, "sub") +lower_fp16_binary(stubs.fp16.hmul, "mul") +lower_fp16_binary(operator.mul, "mul") +lower_fp16_binary(operator.imul, "mul") @lower(stubs.fp16.hneg, types.float16) def ptx_fp16_hneg(context, builder, 
sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, 'neg.f16 $0, $1;', '=h,h') + asm = ir.InlineAsm(fnty, "neg.f16 $0, $1;", "=h,h") return builder.call(asm, args) @@ -414,7 +451,7 @@ def operator_hneg(context, builder, sig, args): @lower(stubs.fp16.habs, types.float16) def ptx_fp16_habs(context, builder, sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, 'abs.f16 $0, $1;', '=h,h') + asm = ir.InlineAsm(fnty, "abs.f16 $0, $1;", "=h,h") return builder.call(asm, args) @@ -450,27 +487,28 @@ def fp16_div(x, y): def _gen_fp16_cmp(op): def ptx_fp16_comparison(context, builder, sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)]) - asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), '=h,h,h') + asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), "=h,h,h") result = builder.call(asm, args) zero = context.get_constant(types.int16, 0) int_result = builder.bitcast(result, ir.IntType(16)) return builder.icmp_unsigned("!=", int_result, zero) + return ptx_fp16_comparison -lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp('eq')) -lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp('eq')) -lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp('ne')) -lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp('ne')) -lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp('ge')) -lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp('ge')) -lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp('gt')) -lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp('gt')) -lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp('le')) -lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp('le')) -lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp('lt')) -lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp('lt')) +lower(stubs.fp16.heq, types.float16, 
types.float16)(_gen_fp16_cmp("eq")) +lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp("eq")) +lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp("ne")) +lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp("ne")) +lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp("ge")) +lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp("ge")) +lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp("gt")) +lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp("gt")) +lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp("le")) +lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp("le")) +lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp("lt")) +lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp("lt")) def lower_fp16_minmax(fn, fname, op): @@ -480,8 +518,8 @@ def ptx_fp16_minmax(context, builder, sig, args): return builder.select(choice, args[0], args[1]) -lower_fp16_minmax(stubs.fp16.hmax, 'max', 'gt') -lower_fp16_minmax(stubs.fp16.hmin, 'min', 'lt') +lower_fp16_minmax(stubs.fp16.hmax, "max", "gt") +lower_fp16_minmax(stubs.fp16.hmin, "min", "lt") # See: # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrt.html#__nv_cbrt @@ -489,8 +527,8 @@ def ptx_fp16_minmax(context, builder, sig, args): cbrt_funcs = { - types.float32: '__nv_cbrtf', - types.float64: '__nv_cbrt', + types.float32: "__nv_cbrtf", + types.float64: "__nv_cbrt", } @@ -514,7 +552,8 @@ def ptx_brev_u4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(32),)), - '__nv_brev') + "__nv_brev", + ) return builder.call(fn, args) @@ -526,15 +565,14 @@ def ptx_brev_u8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(64), (ir.IntType(64),)), - '__nv_brevll') + "__nv_brevll", + ) return builder.call(fn, args) @lower(stubs.clz, types.Any) def 
ptx_clz(context, builder, sig, args): - return builder.ctlz( - args[0], - context.get_constant(types.boolean, 0)) + return builder.ctlz(args[0], context.get_constant(types.boolean, 0)) @lower(stubs.ffs, types.i4) @@ -543,7 +581,8 @@ def ptx_ffs_32(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(32),)), - '__nv_ffs') + "__nv_ffs", + ) return builder.call(fn, args) @@ -553,7 +592,8 @@ def ptx_ffs_64(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(64),)), - '__nv_ffsll') + "__nv_ffsll", + ) return builder.call(fn, args) @@ -567,10 +607,9 @@ def ptx_selp(context, builder, sig, args): def ptx_max_f4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.FloatType(), - (ir.FloatType(), ir.FloatType())), - '__nv_fmaxf') + ir.FunctionType(ir.FloatType(), (ir.FloatType(), ir.FloatType())), + "__nv_fmaxf", + ) return builder.call(fn, args) @@ -580,25 +619,26 @@ def ptx_max_f4(context, builder, sig, args): def ptx_max_f8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.DoubleType(), - (ir.DoubleType(), ir.DoubleType())), - '__nv_fmax') + ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())), + "__nv_fmax", + ) - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - context.cast(builder, args[1], sig.args[1], types.double), - ]) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + context.cast(builder, args[1], sig.args[1], types.double), + ], + ) @lower(min, types.f4, types.f4) def ptx_min_f4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.FloatType(), - (ir.FloatType(), ir.FloatType())), - '__nv_fminf') + ir.FunctionType(ir.FloatType(), (ir.FloatType(), 
ir.FloatType())), + "__nv_fminf", + ) return builder.call(fn, args) @@ -608,15 +648,17 @@ def ptx_min_f4(context, builder, sig, args): def ptx_min_f8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.DoubleType(), - (ir.DoubleType(), ir.DoubleType())), - '__nv_fmin') + ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())), + "__nv_fmin", + ) - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - context.cast(builder, args[1], sig.args[1], types.double), - ]) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + context.cast(builder, args[1], sig.args[1], types.double), + ], + ) @lower(round, types.f4) @@ -624,19 +666,22 @@ def ptx_min_f8(context, builder, sig, args): def ptx_round(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.IntType(64), - (ir.DoubleType(),)), - '__nv_llrint') - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - ]) + ir.FunctionType(ir.IntType(64), (ir.DoubleType(),)), + "__nv_llrint", + ) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + ], + ) # This rounding implementation follows the algorithm used in the "fallback # version" of double_round in CPython. 
# https://github.com/python/cpython/blob/a755410e054e1e2390de5830befc08fe80706c66/Objects/floatobject.c#L964-L1007 + @lower(round, types.f4, types.Integer) @lower(round, types.f8, types.Integer) def round_to_impl(context, builder, sig, args): @@ -651,7 +696,7 @@ def round_ndigits(x, ndigits): pow1 = 10.0 ** (ndigits - 22) pow2 = 1e22 else: - pow1 = 10.0 ** ndigits + pow1 = 10.0**ndigits pow2 = 1.0 y = (x * pow1) * pow2 if math.isinf(y): @@ -662,7 +707,7 @@ def round_ndigits(x, ndigits): y = x / pow1 z = round(y) - if (math.fabs(y - z) == 0.5): + if math.fabs(y - z) == 0.5: # halfway between two integers; use round-half-even z = 2.0 * round(y / 2.0) @@ -673,19 +718,25 @@ def round_ndigits(x, ndigits): return z - return context.compile_internal(builder, round_ndigits, sig, args, ) + return context.compile_internal( + builder, + round_ndigits, + sig, + args, + ) def gen_deg_rad(const): def impl(context, builder, sig, args): - argty, = sig.args + (argty,) = sig.args factor = context.get_constant(argty, const) return builder.fmul(factor, args[0]) + return impl -_deg2rad = math.pi / 180. -_rad2deg = 180. 
/ math.pi +_deg2rad = math.pi / 180.0 +_rad2deg = 180.0 / math.pi lower(math.radians, types.f4)(gen_deg_rad(_deg2rad)) lower(math.radians, types.f8)(gen_deg_rad(_deg2rad)) lower(math.degrees, types.f4)(gen_deg_rad(_rad2deg)) @@ -701,16 +752,18 @@ def _normalize_indices(context, builder, indty, inds, aryty, valty): indices = [inds] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] dtype = aryty.dtype if dtype != valty: raise TypeError("expect %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) return indty, indices @@ -722,14 +775,17 @@ def imp(context, builder, sig, args): ary, inds, val = args dtype = aryty.dtype - indty, indices = _normalize_indices(context, builder, indty, inds, - aryty, valty) + indty, indices = _normalize_indices( + context, builder, indty, inds, aryty, valty + ) lary = context.make_array(aryty)(context, builder, ary) - ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices, - wraparound=True) + ptr = cgutils.get_item_pointer( + context, builder, aryty, lary, indices, wraparound=True + ) # dispatcher to implementation base on dtype return dispatch_fn(context, builder, dtype, ptr, val) + return imp @@ -740,14 +796,16 @@ def imp(context, builder, sig, args): def ptx_atomic_add_tuple(context, builder, dtype, ptr, val): if dtype == types.float32: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_add_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_add_float32(lmod), (ptr, val) + ) elif dtype == types.float64: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_add_float64(lmod), - (ptr, 
val)) + return builder.call( + nvvmutils.declare_atomic_add_float64(lmod), (ptr, val) + ) else: - return builder.atomic_rmw('add', ptr, val, 'monotonic') + return builder.atomic_rmw("add", ptr, val, "monotonic") @lower(stubs.atomic.sub, types.Array, types.intp, types.Any) @@ -757,14 +815,16 @@ def ptx_atomic_add_tuple(context, builder, dtype, ptr, val): def ptx_atomic_sub(context, builder, dtype, ptr, val): if dtype == types.float32: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_sub_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_sub_float32(lmod), (ptr, val) + ) elif dtype == types.float64: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_sub_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_sub_float64(lmod), (ptr, val) + ) else: - return builder.atomic_rmw('sub', ptr, val, 'monotonic') + return builder.atomic_rmw("sub", ptr, val, "monotonic") @lower(stubs.atomic.inc, types.Array, types.intp, types.Any) @@ -775,10 +835,10 @@ def ptx_atomic_inc(context, builder, dtype, ptr, val): if dtype in cuda.cudadecl.unsigned_int_numba_types: bw = dtype.bitwidth lmod = builder.module - fn = getattr(nvvmutils, f'declare_atomic_inc_int{bw}') + fn = getattr(nvvmutils, f"declare_atomic_inc_int{bw}") return builder.call(fn(lmod), (ptr, val)) else: - raise TypeError(f'Unimplemented atomic inc with {dtype} array') + raise TypeError(f"Unimplemented atomic inc with {dtype} array") @lower(stubs.atomic.dec, types.Array, types.intp, types.Any) @@ -789,27 +849,27 @@ def ptx_atomic_dec(context, builder, dtype, ptr, val): if dtype in cuda.cudadecl.unsigned_int_numba_types: bw = dtype.bitwidth lmod = builder.module - fn = getattr(nvvmutils, f'declare_atomic_dec_int{bw}') + fn = getattr(nvvmutils, f"declare_atomic_dec_int{bw}") return builder.call(fn(lmod), (ptr, val)) else: - raise TypeError(f'Unimplemented atomic dec with {dtype} array') + raise TypeError(f"Unimplemented atomic dec with 
{dtype} array") def ptx_atomic_bitwise(stub, op): @_atomic_dispatcher def impl_ptx_atomic(context, builder, dtype, ptr, val): if dtype in (cuda.cudadecl.integer_numba_types): - return builder.atomic_rmw(op, ptr, val, 'monotonic') + return builder.atomic_rmw(op, ptr, val, "monotonic") else: - raise TypeError(f'Unimplemented atomic {op} with {dtype} array') + raise TypeError(f"Unimplemented atomic {op} with {dtype} array") for ty in (types.intp, types.UniTuple, types.Tuple): lower(stub, types.Array, ty, types.Any)(impl_ptx_atomic) -ptx_atomic_bitwise(stubs.atomic.and_, 'and') -ptx_atomic_bitwise(stubs.atomic.or_, 'or') -ptx_atomic_bitwise(stubs.atomic.xor, 'xor') +ptx_atomic_bitwise(stubs.atomic.and_, "and") +ptx_atomic_bitwise(stubs.atomic.or_, "or") +ptx_atomic_bitwise(stubs.atomic.xor, "xor") @lower(stubs.atomic.exch, types.Array, types.intp, types.Any) @@ -818,9 +878,9 @@ def impl_ptx_atomic(context, builder, dtype, ptr, val): @_atomic_dispatcher def ptx_atomic_exch(context, builder, dtype, ptr, val): if dtype in (cuda.cudadecl.integer_numba_types): - return builder.atomic_rmw('xchg', ptr, val, 'monotonic') + return builder.atomic_rmw("xchg", ptr, val, "monotonic") else: - raise TypeError(f'Unimplemented atomic exch with {dtype} array') + raise TypeError(f"Unimplemented atomic exch with {dtype} array") @lower(stubs.atomic.max, types.Array, types.intp, types.Any) @@ -830,17 +890,19 @@ def ptx_atomic_exch(context, builder, dtype, ptr, val): def ptx_atomic_max(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_max_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_max_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_max_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_max_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return 
builder.atomic_rmw('max', ptr, val, ordering='monotonic') + return builder.atomic_rmw("max", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umax', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umax", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic max with %s array' % dtype) + raise TypeError("Unimplemented atomic max with %s array" % dtype) @lower(stubs.atomic.min, types.Array, types.intp, types.Any) @@ -850,17 +912,19 @@ def ptx_atomic_max(context, builder, dtype, ptr, val): def ptx_atomic_min(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_min_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_min_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_min_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_min_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('min', ptr, val, ordering='monotonic') + return builder.atomic_rmw("min", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umin', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umin", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic min with %s array' % dtype) + raise TypeError("Unimplemented atomic min with %s array" % dtype) @lower(stubs.atomic.nanmax, types.Array, types.intp, types.Any) @@ -870,17 +934,19 @@ def ptx_atomic_min(context, builder, dtype, ptr, val): def ptx_atomic_nanmax(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_nanmax_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmax_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return 
builder.call(nvvmutils.declare_atomic_nanmax_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmax_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('max', ptr, val, ordering='monotonic') + return builder.atomic_rmw("max", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umax', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umax", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic max with %s array' % dtype) + raise TypeError("Unimplemented atomic max with %s array" % dtype) @lower(stubs.atomic.nanmin, types.Array, types.intp, types.Any) @@ -890,17 +956,19 @@ def ptx_atomic_nanmax(context, builder, dtype, ptr, val): def ptx_atomic_nanmin(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_nanmin_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmin_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_nanmin_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmin_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('min', ptr, val, ordering='monotonic') + return builder.atomic_rmw("min", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umin', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umin", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic min with %s array' % dtype) + raise TypeError("Unimplemented atomic min with %s array" % dtype) @lower(stubs.atomic.compare_and_swap, types.Array, types.Any, types.Any) @@ -917,19 +985,21 @@ def ptx_atomic_cas(context, builder, sig, args): aryty, indty, oldty, valty = sig.args ary, inds, old, val = args - indty, 
indices = _normalize_indices(context, builder, indty, inds, aryty, - valty) + indty, indices = _normalize_indices( + context, builder, indty, inds, aryty, valty + ) lary = context.make_array(aryty)(context, builder, ary) - ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices, - wraparound=True) + ptr = cgutils.get_item_pointer( + context, builder, aryty, lary, indices, wraparound=True + ) if aryty.dtype in (cuda.cudadecl.integer_numba_types): lmod = builder.module bitwidth = aryty.dtype.bitwidth return nvvmutils.atomic_cmpxchg(builder, lmod, bitwidth, ptr, old, val) else: - raise TypeError('Unimplemented atomic cas with %s array' % aryty.dtype) + raise TypeError("Unimplemented atomic cas with %s array" % aryty.dtype) # ----------------------------------------------------------------------------- @@ -937,15 +1007,20 @@ def ptx_atomic_cas(context, builder, sig, args): @lower(breakpoint) def ptx_brkpt(context, builder, sig, args): - brkpt = ir.InlineAsm(ir.FunctionType(ir.VoidType(), []), - "brkpt;", '', side_effect=True) + brkpt = ir.InlineAsm( + ir.FunctionType(ir.VoidType(), []), "brkpt;", "", side_effect=True + ) builder.call(brkpt, ()) @lower(stubs.nanosleep, types.uint32) def ptx_nanosleep(context, builder, sig, args): - nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]), - "nanosleep.u32 $0;", 'r', side_effect=True) + nanosleep = ir.InlineAsm( + ir.FunctionType(ir.VoidType(), [ir.IntType(32)]), + "nanosleep.u32 $0;", + "r", + side_effect=True, + ) ns = args[0] builder.call(nanosleep, [ns]) @@ -953,8 +1028,9 @@ def ptx_nanosleep(context, builder, sig, args): # ----------------------------------------------------------------------------- -def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, - can_dynsized=False): +def _generic_array( + context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False +): elemcount = reduce(operator.mul, shape, 1) # Check for valid shape for this type of 
allocation. @@ -985,16 +1061,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, lmod = builder.module # Create global variable in the requested address space - gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name, - addrspace) + gvmem = cgutils.add_global_variable( + lmod, laryty, symbol_name, addrspace + ) # Specify alignment to avoid misalignment bug align = context.get_abi_sizeof(lldtype) # Alignment is required to be a power of 2 for shared memory. If it is # not a power of 2 (e.g. for a Record array) then round up accordingly. - gvmem.align = 1 << (align - 1 ).bit_length() + gvmem.align = 1 << (align - 1).bit_length() if dynamic_smem: - gvmem.linkage = 'external' + gvmem.linkage = "external" else: ## Comment out the following line to workaround a NVVM bug ## which generates a invalid symbol name when the linkage @@ -1005,8 +1082,9 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, gvmem.initializer = ir.Constant(laryty, ir.Undefined) # Convert to generic address-space - dataptr = builder.addrspacecast(gvmem, ir.PointerType(ir.IntType(8)), - 'generic') + dataptr = builder.addrspacecast( + gvmem, ir.PointerType(ir.IntType(8)), "generic" + ) targetdata = ll.create_target_data(nvvm.NVVM().data_layout) lldtype = context.get_data_type(dtype) @@ -1027,11 +1105,15 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, # Unfortunately NVVM does not provide an intrinsic for the # %dynamic_smem_size register, so we must read it using inline # assembly. 
- get_dynshared_size = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "mov.u32 $0, %dynamic_smem_size;", - '=r', side_effect=True) - dynsmem_size = builder.zext(builder.call(get_dynshared_size, []), - ir.IntType(64)) + get_dynshared_size = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "mov.u32 $0, %dynamic_smem_size;", + "=r", + side_effect=True, + ) + dynsmem_size = builder.zext( + builder.call(get_dynshared_size, []), ir.IntType(64) + ) # Only 1-D dynamic shared memory is supported so the following is a # sufficient construction of the shape kitemsize = context.get_constant(types.intp, itemsize) @@ -1041,15 +1123,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, # Create array object ndim = len(shape) - aryty = types.Array(dtype=dtype, ndim=ndim, layout='C') + aryty = types.Array(dtype=dtype, ndim=ndim, layout="C") ary = context.make_array(aryty)(context, builder) - context.populate_array(ary, - data=builder.bitcast(dataptr, ary.data.type), - shape=kshape, - strides=kstrides, - itemsize=context.get_constant(types.intp, itemsize), - meminfo=None) + context.populate_array( + ary, + data=builder.bitcast(dataptr, ary.data.type), + shape=kshape, + strides=kstrides, + itemsize=context.get_constant(types.intp, itemsize), + meminfo=None, + ) return ary._getvalue() diff --git a/numba_cuda/numba/cuda/cudamath.py b/numba_cuda/numba/cuda/cudamath.py index 12d9715b6..f03c3b2ba 100644 --- a/numba_cuda/numba/cuda/cudamath.py +++ b/numba_cuda/numba/cuda/cudamath.py @@ -136,5 +136,5 @@ class Math_isnan(ConcreteTemplate): class Math_modf(ConcreteTemplate): cases = [ signature(types.UniTuple(types.float64, 2), types.float64), - signature(types.UniTuple(types.float32, 2), types.float32) + signature(types.UniTuple(types.float32, 2), types.float32), ] diff --git a/numba_cuda/numba/cuda/debuginfo.py b/numba_cuda/numba/cuda/debuginfo.py index 8b65c825b..2cfc5916d 100644 --- a/numba_cuda/numba/cuda/debuginfo.py +++ 
b/numba_cuda/numba/cuda/debuginfo.py @@ -7,7 +7,6 @@ class CUDADIBuilder(DIBuilder): - def _var_type(self, lltype, size, datamodel=None): is_bool = False is_grid_group = False @@ -34,11 +33,14 @@ def _var_type(self, lltype, size, datamodel=None): elif is_grid_group: ditok = "DW_ATE_unsigned" - return m.add_debug_info('DIBasicType', { - 'name': name, - 'size': bitsize, - 'encoding': ir.DIToken(ditok), - }) + return m.add_debug_info( + "DIBasicType", + { + "name": name, + "size": bitsize, + "encoding": ir.DIToken(ditok), + }, + ) # For other cases, use upstream Numba implementation return super()._var_type(lltype, size, datamodel=datamodel) diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py index db62fb96a..edc904f0d 100644 --- a/numba_cuda/numba/cuda/decorators.py +++ b/numba_cuda/numba/cuda/decorators.py @@ -6,13 +6,24 @@ from numba.cuda.simulator.kernel import FakeCUDAKernel -_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. " - "Signatures should be passed as the first " - "positional argument.") - - -def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, - opt=None, lineinfo=False, cache=False, **kws): +_msg_deprecated_signature_arg = ( + "Deprecated keyword argument `{0}`. " + "Signatures should be passed as the first " + "positional argument." +) + + +def jit( + func_or_sig=None, + device=False, + inline=False, + link=[], + debug=None, + opt=None, + lineinfo=False, + cache=False, + **kws, +): """ JIT compile a Python function for CUDA GPUs. 
@@ -55,39 +66,43 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, """ if link and config.ENABLE_CUDASIM: - raise NotImplementedError('Cannot link PTX in the simulator') + raise NotImplementedError("Cannot link PTX in the simulator") - if kws.get('boundscheck'): + if kws.get("boundscheck"): raise NotImplementedError("bounds checking is not supported for CUDA") - if kws.get('argtypes') is not None: - msg = _msg_deprecated_signature_arg.format('argtypes') + if kws.get("argtypes") is not None: + msg = _msg_deprecated_signature_arg.format("argtypes") raise DeprecationError(msg) - if kws.get('restype') is not None: - msg = _msg_deprecated_signature_arg.format('restype') + if kws.get("restype") is not None: + msg = _msg_deprecated_signature_arg.format("restype") raise DeprecationError(msg) - if kws.get('bind') is not None: - msg = _msg_deprecated_signature_arg.format('bind') + if kws.get("bind") is not None: + msg = _msg_deprecated_signature_arg.format("bind") raise DeprecationError(msg) debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug opt = (config.OPT != 0) if opt is None else opt - fastmath = kws.get('fastmath', False) - extensions = kws.get('extensions', []) + fastmath = kws.get("fastmath", False) + extensions = kws.get("extensions", []) if debug and opt: - msg = ("debug=True with opt=True " - "is not supported by CUDA. This may result in a crash" - " - set debug=False or opt=False.") + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." + ) warn(NumbaInvalidConfigWarning(msg)) if debug and lineinfo: - msg = ("debug and lineinfo are mutually exclusive. Use debug to get " - "full debug info (this disables some optimizations), or " - "lineinfo for line info only with code generation unaffected.") + msg = ( + "debug and lineinfo are mutually exclusive. 
Use debug to get " + "full debug info (this disables some optimizations), or " + "lineinfo for line info only with code generation unaffected." + ) warn(NumbaInvalidConfigWarning(msg)) - if device and kws.get('link'): + if device and kws.get("link"): raise ValueError("link keyword invalid for device function") if sigutils.is_signature(func_or_sig): @@ -101,19 +116,21 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, if signatures is not None: if config.ENABLE_CUDASIM: + def jitwrapper(func): return FakeCUDAKernel(func, device=device, fastmath=fastmath) + return jitwrapper def _jit(func): targetoptions = kws.copy() - targetoptions['debug'] = debug - targetoptions['lineinfo'] = lineinfo - targetoptions['link'] = link - targetoptions['opt'] = opt - targetoptions['fastmath'] = fastmath - targetoptions['device'] = device - targetoptions['extensions'] = extensions + targetoptions["debug"] = debug + targetoptions["lineinfo"] = lineinfo + targetoptions["link"] = link + targetoptions["opt"] = opt + targetoptions["fastmath"] = fastmath + targetoptions["device"] = device + targetoptions["extensions"] = extensions disp = CUDADispatcher(func, targetoptions=targetoptions) @@ -128,6 +145,7 @@ def _jit(func): if device: from numba.core import typeinfer + with typeinfer.register_dispatcher(disp): disp.compile_device(argtypes, restype) else: @@ -142,29 +160,41 @@ def _jit(func): else: if func_or_sig is None: if config.ENABLE_CUDASIM: + def autojitwrapper(func): - return FakeCUDAKernel(func, device=device, - fastmath=fastmath) + return FakeCUDAKernel( + func, device=device, fastmath=fastmath + ) else: + def autojitwrapper(func): - return jit(func, device=device, debug=debug, opt=opt, - lineinfo=lineinfo, link=link, cache=cache, **kws) + return jit( + func, + device=device, + debug=debug, + opt=opt, + lineinfo=lineinfo, + link=link, + cache=cache, + **kws, + ) return autojitwrapper # func_or_sig is a function else: if config.ENABLE_CUDASIM: - return 
FakeCUDAKernel(func_or_sig, device=device, - fastmath=fastmath) + return FakeCUDAKernel( + func_or_sig, device=device, fastmath=fastmath + ) else: targetoptions = kws.copy() - targetoptions['debug'] = debug - targetoptions['lineinfo'] = lineinfo - targetoptions['opt'] = opt - targetoptions['link'] = link - targetoptions['fastmath'] = fastmath - targetoptions['device'] = device - targetoptions['extensions'] = extensions + targetoptions["debug"] = debug + targetoptions["lineinfo"] = lineinfo + targetoptions["opt"] = opt + targetoptions["link"] = link + targetoptions["fastmath"] = fastmath + targetoptions["device"] = device + targetoptions["extensions"] = extensions disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions) if cache: @@ -191,7 +221,7 @@ def declare_device(name, sig, link=None): argtypes, restype = sigutils.normalize_signature(sig) if restype is None: - msg = 'Return type must be provided for device declarations' + msg = "Return type must be provided for device declarations" raise TypeError(msg) return declare_device_function(name, restype, argtypes, link) diff --git a/numba_cuda/numba/cuda/descriptor.py b/numba_cuda/numba/cuda/descriptor.py index b91ddf7a1..965f301be 100644 --- a/numba_cuda/numba/cuda/descriptor.py +++ b/numba_cuda/numba/cuda/descriptor.py @@ -30,4 +30,4 @@ def target_context(self): return self._targetctx -cuda_target = CUDATarget('cuda') +cuda_target = CUDATarget("cuda") diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py index e4352903b..da8074754 100644 --- a/numba_cuda/numba/cuda/device_init.py +++ b/numba_cuda/numba/cuda/device_init.py @@ -1,21 +1,58 @@ # Re export import sys from numba.cuda import cg -from .stubs import (threadIdx, blockIdx, blockDim, gridDim, laneid, warpsize, - syncwarp, shared, local, const, atomic, - shfl_sync_intrinsic, vote_sync_intrinsic, match_any_sync, - match_all_sync, threadfence_block, threadfence_system, - threadfence, selp, popc, brev, clz, ffs, fma, cbrt, 
- activemask, lanemask_lt, nanosleep, fp16, - _vector_type_stubs) -from .intrinsics import (grid, gridsize, syncthreads, syncthreads_and, - syncthreads_count, syncthreads_or) +from .stubs import ( + threadIdx, + blockIdx, + blockDim, + gridDim, + laneid, + warpsize, + syncwarp, + shared, + local, + const, + atomic, + shfl_sync_intrinsic, + vote_sync_intrinsic, + match_any_sync, + match_all_sync, + threadfence_block, + threadfence_system, + threadfence, + selp, + popc, + brev, + clz, + ffs, + fma, + cbrt, + activemask, + lanemask_lt, + nanosleep, + fp16, + _vector_type_stubs, +) +from .intrinsics import ( + grid, + gridsize, + syncthreads, + syncthreads_and, + syncthreads_count, + syncthreads_or, +) from .cudadrv.error import CudaSupportError -from numba.cuda.cudadrv.driver import (BaseCUDAMemoryManager, - HostOnlyCUDAMemoryManager, - GetIpcHandleMixin, MemoryPointer, - MappedMemory, PinnedMemory, MemoryInfo, - IpcHandle, set_memory_manager) +from numba.cuda.cudadrv.driver import ( + BaseCUDAMemoryManager, + HostOnlyCUDAMemoryManager, + GetIpcHandleMixin, + MemoryPointer, + MappedMemory, + PinnedMemory, + MemoryInfo, + IpcHandle, + set_memory_manager, +) from numba.cuda.cudadrv.runtime import runtime from .cudadrv import nvvm from numba.cuda import initialize @@ -26,13 +63,27 @@ from .api import _auto_device from .args import In, Out, InOut -from .intrinsic_wrapper import (all_sync, any_sync, eq_sync, ballot_sync, - shfl_sync, shfl_up_sync, shfl_down_sync, - shfl_xor_sync) +from .intrinsic_wrapper import ( + all_sync, + any_sync, + eq_sync, + ballot_sync, + shfl_sync, + shfl_up_sync, + shfl_down_sync, + shfl_xor_sync, +) from .kernels import reduction from numba.cuda.cudadrv.linkable_code import ( - Archive, CUSource, Cubin, Fatbin, LinkableCode, LTOIR, Object, PTXSource + Archive, + CUSource, + Cubin, + Fatbin, + LinkableCode, + LTOIR, + Object, + PTXSource, ) reduce = Reduce = reduction.Reduce diff --git a/numba_cuda/numba/cuda/deviceufunc.py 
b/numba_cuda/numba/cuda/deviceufunc.py index c29335a91..e6e69f4db 100644 --- a/numba_cuda/numba/cuda/deviceufunc.py +++ b/numba_cuda/numba/cuda/deviceufunc.py @@ -72,12 +72,12 @@ class UFuncMechanism(object): """ Prepare ufunc arguments for vectorize. """ + DEFAULT_STREAM = None SUPPORT_DEVICE_SLICING = False def __init__(self, typemap, args): - """Never used directly by user. Invoke by UFuncMechanism.call(). - """ + """Never used directly by user. Invoke by UFuncMechanism.call().""" self.typemap = typemap self.args = args nargs = len(self.args) @@ -105,7 +105,7 @@ def _fill_argtypes(self): """ for i, ary in enumerate(self.arrays): if ary is not None: - dtype = getattr(ary, 'dtype') + dtype = getattr(ary, "dtype") if dtype is None: dtype = np.asarray(ary).dtype self.argtypes[i] = dtype @@ -120,8 +120,9 @@ def _resolve_signature(self): # Try resolve scalar arguments for formaltys in self.typemap: match_map = [] - for i, (formal, actual) in enumerate(zip(formaltys, - self.argtypes)): + for i, (formal, actual) in enumerate( + zip(formaltys, self.argtypes) + ): if actual is None: actual = np.asarray(self.args[i]).dtype @@ -134,21 +135,26 @@ def _resolve_signature(self): if not matches: matches = [] for formaltys in self.typemap: - all_matches = all(actual is None or formal == actual - for formal, actual in - zip(formaltys, self.argtypes)) + all_matches = all( + actual is None or formal == actual + for formal, actual in zip(formaltys, self.argtypes) + ) if all_matches: matches.append(formaltys) if not matches: - raise TypeError("No matching version. GPU ufunc requires array " - "arguments to have the exact types. This behaves " - "like regular ufunc with casting='no'.") + raise TypeError( + "No matching version. GPU ufunc requires array " + "arguments to have the exact types. This behaves " + "like regular ufunc with casting='no'." + ) if len(matches) > 1: - raise TypeError("Failed to resolve ufunc due to ambiguous " - "signature. Too many untyped scalars. 
" - "Use numpy dtype object to type tag.") + raise TypeError( + "Failed to resolve ufunc due to ambiguous " + "signature. Too many untyped scalars. " + "Use numpy dtype object to type tag." + ) # Try scalar arguments self.argtypes = matches[0] @@ -163,8 +169,7 @@ def _get_actual_args(self): return self.arrays def _broadcast(self, arys): - """Perform numpy ufunc broadcasting - """ + """Perform numpy ufunc broadcasting""" shapelist = [a.shape for a in arys] shape = _multi_broadcast(*shapelist) @@ -177,9 +182,11 @@ def _broadcast(self, arys): arys[i] = self.broadcast_device(ary, shape) else: - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -187,9 +194,9 @@ def _broadcast(self, arys): for ax in ax_differs: strides[ax] = 0 - strided = np.lib.stride_tricks.as_strided(ary, - shape=shape, - strides=strides) + strided = np.lib.stride_tricks.as_strided( + ary, shape=shape, strides=strides + ) arys[i] = self.force_array_layout(strided) @@ -206,8 +213,7 @@ def get_arguments(self): return self._broadcast(arys) def get_function(self): - """Returns (result_dtype, function) - """ + """Returns (result_dtype, function)""" return self.typemap[self.argtypes] def is_device_array(self, obj): @@ -240,14 +246,13 @@ def force_array_layout(self, ary): @classmethod def call(cls, typemap, args, kws): - """Perform the entire ufunc call mechanism. 
- """ + """Perform the entire ufunc call mechanism.""" # Handle keywords - stream = kws.pop('stream', cls.DEFAULT_STREAM) - out = kws.pop('out', None) + stream = kws.pop("stream", cls.DEFAULT_STREAM) + out = kws.pop("out", None) if kws: - warnings.warn("unrecognized keywords: %s" % ', '.join(kws)) + warnings.warn("unrecognized keywords: %s" % ", ".join(kws)) # Begin call resolution cr = cls(typemap, args) @@ -364,9 +369,11 @@ def __init__(self, func, identity=None, cache=False, targetoptions={}): if cache: raise TypeError("caching is not supported") for opt in targetoptions: - if opt == 'nopython': - warnings.warn("nopython kwarg for cuda target is redundant", - RuntimeWarning) + if opt == "nopython": + warnings.warn( + "nopython kwarg for cuda target is redundant", + RuntimeWarning, + ) else: fmt = "Unrecognized options. " fmt += "cuda vectorize target does not support option: '%s'" @@ -386,14 +393,15 @@ def add(self, sig=None): devfnsig = signature(return_type, *args) funcname = self.pyfunc.__name__ - kernelsource = self._get_kernel_source(self._kernel_template, - devfnsig, funcname) + kernelsource = self._get_kernel_source( + self._kernel_template, devfnsig, funcname + ) corefn, return_type = self._compile_core(devfnsig) glbl = self._get_globals(corefn) sig = signature(types.void, *([a[:] for a in args] + [return_type[:]])) exec(kernelsource, glbl) - stager = glbl['__vectorized_%s' % funcname] + stager = glbl["__vectorized_%s" % funcname] kernel = self._compile_kernel(stager, sig) argdtypes = tuple(to_dtype(t) for t in devfnsig.args) @@ -404,10 +412,12 @@ def build_ufunc(self): raise NotImplementedError def _get_kernel_source(self, template, sig, funcname): - args = ['a%d' % i for i in range(len(sig.args))] - fmts = dict(name=funcname, - args=', '.join(args), - argitems=', '.join('%s[__tid__]' % i for i in args)) + args = ["a%d" % i for i in range(len(sig.args))] + fmts = dict( + name=funcname, + args=", ".join(args), + argitems=", ".join("%s[__tid__]" % i for i 
in args), + ) return template.format(**fmts) def _compile_core(self, sig): @@ -421,19 +431,26 @@ def _compile_kernel(self, fnobj, sig): class DeviceGUFuncVectorize(_BaseUFuncBuilder): - def __init__(self, func, sig, identity=None, cache=False, targetoptions={}, - writable_args=()): + def __init__( + self, + func, + sig, + identity=None, + cache=False, + targetoptions={}, + writable_args=(), + ): if cache: raise TypeError("caching is not supported") if writable_args: raise TypeError("writable_args are not supported") # Allow nopython flag to be set. - if not targetoptions.pop('nopython', True): + if not targetoptions.pop("nopython", True): raise TypeError("nopython flag must be True") # Are there any more target options? if targetoptions: - opts = ', '.join([repr(k) for k in targetoptions.keys()]) + opts = ", ".join([repr(k) for k in targetoptions.keys()]) fmt = "The following target options are not supported: {0}" raise TypeError(fmt.format(opts)) @@ -458,18 +475,21 @@ def add(self, sig=None): # specify the return type (where the "Python None" is the return type) valid_return_type = return_type in (types.none, None) if not valid_return_type: - raise TypeError('guvectorized functions cannot return values: ' - f'signature {sig} specifies {return_type} return ' - 'type') + raise TypeError( + "guvectorized functions cannot return values: " + f"signature {sig} specifies {return_type} return " + "type" + ) funcname = self.py_func.__name__ - src = expand_gufunc_template(self._kernel_template, indims, - outdims, funcname, args) + src = expand_gufunc_template( + self._kernel_template, indims, outdims, funcname, args + ) glbls = self._get_globals(sig) exec(src, glbls) - fnobj = glbls['__gufunc_{name}'.format(name=funcname)] + fnobj = glbls["__gufunc_{name}".format(name=funcname)] outertys = list(_determine_gufunc_outer_types(args, indims + outdims)) kernel = self._compile_kernel(fnobj, sig=tuple(outertys)) @@ -495,49 +515,58 @@ def _determine_gufunc_outer_types(argtys, 
dims): else: if nd > 0: raise ValueError("gufunc signature mismatch: ndim>0 for scalar") - yield types.Array(dtype=at, ndim=1, layout='A') + yield types.Array(dtype=at, ndim=1, layout="A") def expand_gufunc_template(template, indims, outdims, funcname, argtypes): - """Expand gufunc source template - """ + """Expand gufunc source template""" argdims = indims + outdims argnames = ["arg{0}".format(i) for i in range(len(argdims))] - checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a) - for a in argnames])) - inputs = [_gen_src_for_indexing(aref, adims, atype) - for aref, adims, atype in zip(argnames, indims, argtypes)] - outputs = [_gen_src_for_indexing(aref, adims, atype) - for aref, adims, atype in zip(argnames[len(indims):], outdims, - argtypes[len(indims):])] + checkedarg = "min({0})".format( + ", ".join(["{0}.shape[0]".format(a) for a in argnames]) + ) + inputs = [ + _gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames, indims, argtypes) + ] + outputs = [ + _gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip( + argnames[len(indims) :], outdims, argtypes[len(indims) :] + ) + ] argitems = inputs + outputs - src = template.format(name=funcname, args=', '.join(argnames), - checkedarg=checkedarg, - argitems=', '.join(argitems)) + src = template.format( + name=funcname, + args=", ".join(argnames), + checkedarg=checkedarg, + argitems=", ".join(argitems), + ) return src def _gen_src_for_indexing(aref, adims, atype): - return "{aref}[{sliced}]".format(aref=aref, - sliced=_gen_src_index(adims, atype)) + return "{aref}[{sliced}]".format( + aref=aref, sliced=_gen_src_index(adims, atype) + ) def _gen_src_index(adims, atype): if adims > 0: - return ','.join(['__tid__'] + [':'] * adims) + return ",".join(["__tid__"] + [":"] * adims) elif isinstance(atype, types.Array) and atype.ndim - 1 == adims: # Special case for 0-nd in shape-signature but # 1d array in type signature. 
# Slice it so that the result has the same dimension. - return '__tid__:(__tid__ + 1)' + return "__tid__:(__tid__ + 1)" else: - return '__tid__' + return "__tid__" class GUFuncEngine(object): - '''Determine how to broadcast and execute a gufunc + """Determine how to broadcast and execute a gufunc base on input shape and signature - ''' + """ @classmethod def from_signature(cls, signature): @@ -553,7 +582,7 @@ def __init__(self, inputsig, outputsig): def schedule(self, ishapes): if len(ishapes) != self.nin: - raise TypeError('invalid number of input argument') + raise TypeError("invalid number of input argument") # associate symbol values for input signature symbolmap = {} @@ -626,7 +655,7 @@ def __init__(self, parent, ishapes, oshapes, loopdims, pinned): def __str__(self): import pprint - attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned' + attrs = "ishapes", "oshapes", "loopdims", "loopn", "pinned" values = [(k, getattr(self, k)) for k in attrs] return pprint.pformat(dict(values)) @@ -635,13 +664,15 @@ class GeneralizedUFunc(object): def __init__(self, kernelmap, engine): self.kernelmap = kernelmap self.engine = engine - self.max_blocksize = 2 ** 30 + self.max_blocksize = 2**30 def __call__(self, *args, **kws): - callsteps = self._call_steps(self.engine.nin, self.engine.nout, - args, kws) + callsteps = self._call_steps( + self.engine.nin, self.engine.nout, args, kws + ) indtypes, schedule, outdtypes, kernel = self._schedule( - callsteps.inputs, callsteps.outputs) + callsteps.inputs, callsteps.outputs + ) callsteps.adjust_input_types(indtypes) outputs = callsteps.prepare_outputs(schedule, outdtypes) @@ -671,7 +702,7 @@ def _schedule(self, inputs, outs): # check output for sched_shape, out in zip(schedule.output_shapes, outs): if out is not None and sched_shape != out.shape: - raise ValueError('output shape mismatch') + raise ValueError("output shape mismatch") return indtypes, schedule, outdtypes, kernel @@ -683,8 +714,10 @@ def 
_search_matching_signature(self, idtypes): Note: Ordering is guaranteed by `kernelmap` being a OrderedDict """ for sig in self.kernelmap.keys(): - if all(np.can_cast(actual, desired) - for actual, desired in zip(sig, idtypes)): + if all( + np.can_cast(actual, desired) + for actual, desired in zip(sig, idtypes) + ): return sig else: raise TypeError("no matching signature") @@ -716,8 +749,9 @@ def _broadcast_array(self, ary, newdim, innerdim): # Creating new dimension elif len(ary.shape) < len(newshape): - assert newshape[-len(ary.shape):] == ary.shape, \ + assert newshape[-len(ary.shape) :] == ary.shape, ( "cannot add dim and reshape at the same time" + ) return self._broadcast_add_axis(ary, newshape) # Collapsing dimension @@ -744,9 +778,9 @@ class GUFuncCallSteps(metaclass=ABCMeta): # The base class uses these slots; subclasses may provide additional slots. __slots__ = [ - 'outputs', - 'inputs', - '_copy_result_to_host', + "outputs", + "inputs", + "_copy_result_to_host", ] @abstractmethod @@ -782,21 +816,25 @@ def allocate_device_array(self, shape, dtype): """ def __init__(self, nin, nout, args, kwargs): - outputs = kwargs.get('out') + outputs = kwargs.get("out") # Ensure the user has passed a correct number of arguments if outputs is None and len(args) not in (nin, (nin + nout)): + def pos_argn(n): - return f'{n} positional argument{"s" * (n != 1)}' + return f"{n} positional argument{'s' * (n != 1)}" - msg = (f'This gufunc accepts {pos_argn(nin)} (when providing ' - f'input only) or {pos_argn(nin + nout)} (when providing ' - f'input and output). Got {pos_argn(len(args))}.') + msg = ( + f"This gufunc accepts {pos_argn(nin)} (when providing " + f"input only) or {pos_argn(nin + nout)} (when providing " + f"input and output). Got {pos_argn(len(args))}." 
+ ) raise TypeError(msg) if outputs is not None and len(args) > nin: - raise ValueError("cannot specify argument 'out' as both positional " - "and keyword") + raise ValueError( + "cannot specify argument 'out' as both positional and keyword" + ) else: # If the user did not pass outputs either in the out kwarg or as # positional arguments, then we need to generate an initial list of @@ -819,8 +857,9 @@ def pos_argn(n): # - If any of the arguments are device arrays, we leave the output on # the device. - self._copy_result_to_host = (all_host_arrays and - all_user_outputs_are_host) + self._copy_result_to_host = ( + all_host_arrays and all_user_outputs_are_host + ) # Normalize arguments - ensure they are either device- or host-side # arrays (as opposed to lists, tuples, etc). @@ -850,9 +889,11 @@ def adjust_input_types(self, indtypes): """ for i, (ity, val) in enumerate(zip(indtypes, self.inputs)): if ity != val.dtype: - if not hasattr(val, 'astype'): - msg = ("compatible signature is possible by casting but " - "{0} does not support .astype()").format(type(val)) + if not hasattr(val, "astype"): + msg = ( + "compatible signature is possible by casting but " + "{0} does not support .astype()" + ).format(type(val)) raise TypeError(msg) # Cast types self.inputs[i] = val.astype(ity) @@ -866,8 +907,9 @@ def prepare_outputs(self, schedule, outdtypes): device; other outputs are allocated as necessary. """ outputs = [] - for shape, dtype, output in zip(schedule.output_shapes, outdtypes, - self.outputs): + for shape, dtype, output in zip( + schedule.output_shapes, outdtypes, self.outputs + ): if output is None or self._copy_result_to_host: output = self.allocate_device_array(shape, dtype) outputs.append(output) @@ -878,6 +920,7 @@ def prepare_inputs(self): """ Returns a list of input parameters that all reside on the target device. 
""" + def ensure_device(parameter): if self.is_device_array(parameter): convert = self.as_device_array @@ -897,8 +940,10 @@ def post_process_outputs(self, outputs): jarring, it is consistent with the behavior of GUFuncs in general. """ if self._copy_result_to_host: - outputs = [self.to_host(output, self_output) - for output, self_output in zip(outputs, self.outputs)] + outputs = [ + self.to_host(output, self_output) + for output, self_output in zip(outputs, self.outputs) + ] elif self.outputs[0] is not None: outputs = self.outputs diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index ba90f53a0..9f258db33 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -15,13 +15,19 @@ from numba.core.types.functions import Function from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import (compile_cuda, CUDACompiler, kernel_fixup, - ExternFunction) +from numba.cuda.compiler import ( + compile_cuda, + CUDACompiler, + kernel_fixup, + ExternFunction, +) from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.descriptor import cuda_target -from numba.cuda.errors import (missing_launch_config_msg, - normalize_kernel_dimensions) +from numba.cuda.errors import ( + missing_launch_config_msg, + normalize_kernel_dimensions, +) from numba.cuda import types as cuda_types from numba.cuda.runtime.nrt import rtsys @@ -30,17 +36,26 @@ from warnings import warn -cuda_fp16_math_funcs = ['hsin', 'hcos', - 'hlog', 'hlog10', - 'hlog2', - 'hexp', 'hexp10', - 'hexp2', - 'hsqrt', 'hrsqrt', - 'hfloor', 'hceil', - 'hrcp', 'hrint', - 'htrunc', 'hdiv'] - -reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape'] +cuda_fp16_math_funcs = [ + "hsin", + "hcos", + "hlog", + "hlog10", + "hlog2", + "hexp", + "hexp10", + "hexp2", + "hsqrt", + "hrsqrt", + "hfloor", + "hceil", + "hrcp", + "hrint", + "htrunc", + 
"hdiv", +] + +reshape_funcs = ["nocopy_empty_reshape", "numba_attempt_nocopy_reshape"] def get_cres_link_objects(cres): @@ -51,17 +66,16 @@ def get_cres_link_objects(cres): # List of calls into declared device functions device_func_calls = [ - (name, v) for name, v in cres.fndesc.typemap.items() if ( - isinstance(v, cuda_types.CUDADispatcher) - ) + (name, v) + for name, v in cres.fndesc.typemap.items() + if (isinstance(v, cuda_types.CUDADispatcher)) ] # List of tuples with SSA name of calls and corresponding signature call_signatures = [ (call.func.name, sig) - for call, sig in cres.fndesc.calltypes.items() if ( - isinstance(call, ir.Expr) and call.op == 'call' - ) + for call, sig in cres.fndesc.calltypes.items() + if (isinstance(call, ir.Expr) and call.op == "call") ] # Map SSA names to all invoked signatures @@ -93,10 +107,10 @@ def get_cres_link_objects(cres): class _Kernel(serialize.ReduceMixin): - ''' + """ CUDA Kernel specialized for a given set of argument types. When called, this object launches the kernel on the device. 
- ''' + """ NRT_functions = [ "NRT_Allocate", @@ -110,16 +124,27 @@ class _Kernel(serialize.ReduceMixin): "NRT_MemInfo_alloc_aligned", "NRT_Allocate_External", "NRT_decref", - "NRT_incref" + "NRT_incref", ] @global_compiler_lock - def __init__(self, py_func, argtypes, link=None, debug=False, - lineinfo=False, inline=False, fastmath=False, extensions=None, - max_registers=None, lto=False, opt=True, device=False): - + def __init__( + self, + py_func, + argtypes, + link=None, + debug=False, + lineinfo=False, + inline=False, + fastmath=False, + extensions=None, + max_registers=None, + lto=False, + opt=True, + device=False, + ): if device: - raise RuntimeError('Cannot compile a device function as a kernel') + raise RuntimeError("Cannot compile a device function as a kernel") super().__init__() @@ -144,24 +169,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False, self.lineinfo = lineinfo self.extensions = extensions or [] - nvvm_options = { - 'fastmath': fastmath, - 'opt': 3 if opt else 0 - } + nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0} if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None cc = get_current_device().compute_capability - cres = compile_cuda(self.py_func, types.void, self.argtypes, - debug=self.debug, - lineinfo=lineinfo, - inline=inline, - fastmath=fastmath, - nvvm_options=nvvm_options, - cc=cc, - max_registers=max_registers, - lto=lto) + cres = compile_cuda( + self.py_func, + types.void, + self.argtypes, + debug=self.debug, + lineinfo=lineinfo, + inline=inline, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + max_registers=max_registers, + lto=lto, + ) tgt_ctx = cres.target_context lib = cres.library kernel = lib.get_function(cres.fndesc.llvm_func_name) @@ -174,24 +200,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False, asm = lib.get_asm_str() # A kernel needs cooperative launch if grid_sync is being used. 
- self.cooperative = 'cudaCGGetIntrinsicHandle' in asm + self.cooperative = "cudaCGGetIntrinsicHandle" in asm # We need to link against cudadevrt if grid sync is being used. if self.cooperative: lib.needs_cudadevrt = True - def link_to_library_functions(library_functions, library_path, - prefix=None): + def link_to_library_functions( + library_functions, library_path, prefix=None + ): """ Dynamically links to library functions by searching for their names in the specified library and linking to the corresponding source file. """ if prefix is not None: - library_functions = [f"{prefix}{fn}" for fn in - library_functions] + library_functions = [ + f"{prefix}{fn}" for fn in library_functions + ] - found_functions = [fn for fn in library_functions - if f'{fn}' in asm] + found_functions = [fn for fn in library_functions if f"{fn}" in asm] if found_functions: basedir = os.path.dirname(os.path.abspath(__file__)) @@ -201,11 +228,11 @@ def link_to_library_functions(library_functions, library_path, return found_functions # Link to the helper library functions if needed - link_to_library_functions(reshape_funcs, 'reshape_funcs.cu') + link_to_library_functions(reshape_funcs, "reshape_funcs.cu") # Link to the CUDA FP16 math library functions if needed - link_to_library_functions(cuda_fp16_math_funcs, - 'cpp_function_wrappers.cu', - '__numba_wrapper_') + link_to_library_functions( + cuda_fp16_math_funcs, "cpp_function_wrappers.cu", "__numba_wrapper_" + ) self.maybe_link_nrt(link, tgt_ctx, asm) @@ -239,15 +266,16 @@ def maybe_link_nrt(self, link, tgt_ctx, asm): all_nrt = "|".join(self.NRT_functions) pattern = ( - r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?(' - + all_nrt + r')\s*\([^)]*\)\s*;' + r"\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?(" + + all_nrt + + r")\s*\([^)]*\)\s*;" ) nrt_in_asm = re.findall(pattern, asm) basedir = os.path.dirname(os.path.abspath(__file__)) if nrt_in_asm: - nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu') + nrt_path = os.path.join(basedir, "runtime", 
"nrt.cu") link.append(nrt_path) @property @@ -270,8 +298,17 @@ def argument_types(self): return tuple(self.signature.args) @classmethod - def _rebuild(cls, cooperative, name, signature, codelibrary, - debug, lineinfo, call_helper, extensions): + def _rebuild( + cls, + cooperative, + name, + signature, + codelibrary, + debug, + lineinfo, + call_helper, + extensions, + ): """ Rebuild an instance. """ @@ -299,10 +336,16 @@ def _reduce_states(self): Thread, block and shared memory configuration are serialized. Stream information is discarded. """ - return dict(cooperative=self.cooperative, name=self.entry_name, - signature=self.signature, codelibrary=self._codelibrary, - debug=self.debug, lineinfo=self.lineinfo, - call_helper=self.call_helper, extensions=self.extensions) + return dict( + cooperative=self.cooperative, + name=self.entry_name, + signature=self.signature, + codelibrary=self._codelibrary, + debug=self.debug, + lineinfo=self.lineinfo, + call_helper=self.call_helper, + extensions=self.extensions, + ) def bind(self): """ @@ -323,73 +366,73 @@ def bind(self): @property def regs_per_thread(self): - ''' + """ The number of registers used by each thread for this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.regs @property def const_mem_size(self): - ''' + """ The amount of constant memory used by this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.const @property def shared_mem_per_block(self): - ''' + """ The amount of shared memory used per block for this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.shared @property def max_threads_per_block(self): - ''' + """ The maximum allowable threads per block. - ''' + """ return self._codelibrary.get_cufunc().attrs.maxthreads @property def local_mem_per_thread(self): - ''' + """ The amount of local memory used per thread for this kernel. 
- ''' + """ return self._codelibrary.get_cufunc().attrs.local def inspect_llvm(self): - ''' + """ Returns the LLVM IR for this kernel. - ''' + """ return self._codelibrary.get_llvm_str() def inspect_asm(self, cc): - ''' + """ Returns the PTX code for this kernel. - ''' + """ return self._codelibrary.get_asm_str(cc=cc) def inspect_sass_cfg(self): - ''' + """ Returns the CFG of the SASS for this kernel. Requires nvdisasm to be available on the PATH. - ''' + """ return self._codelibrary.get_sass_cfg() def inspect_sass(self): - ''' + """ Returns the SASS code for this kernel. Requires nvdisasm to be available on the PATH. - ''' + """ return self._codelibrary.get_sass() def inspect_types(self, file=None): - ''' + """ Produce a dump of the Python source of this function annotated with the corresponding Numba IR and type information. The dump is written to *file*, or *sys.stdout* if *file* is *None*. - ''' + """ if self._type_annotation is None: raise ValueError("Type annotation is not available") @@ -397,12 +440,12 @@ def inspect_types(self, file=None): file = sys.stdout print("%s %s" % (self.entry_name, self.argument_types), file=file) - print('-' * 80, file=file) + print("-" * 80, file=file) print(self._type_annotation, file=file) - print('=' * 80, file=file) + print("=" * 80, file=file) def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0): - ''' + """ Calculates the maximum number of blocks that can be launched for this kernel in a cooperative grid in the current context, for the given block and dynamic shared memory sizes. @@ -411,15 +454,15 @@ def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0): a tuple for 2D or 3D blocks. :param dynsmemsize: Dynamic shared memory size in bytes. :return: The maximum number of blocks in the grid. 
- ''' + """ ctx = get_context() cufunc = self._codelibrary.get_cufunc() if isinstance(blockdim, tuple): blockdim = functools.reduce(lambda x, y: x * y, blockdim) - active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc, - blockdim, - dynsmemsize) + active_per_sm = ctx.get_active_blocks_per_multiprocessor( + cufunc, blockdim, dynsmemsize + ) sm_count = ctx.device.MULTIPROCESSOR_COUNT return active_per_sm * sm_count @@ -435,7 +478,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): excmem.memset(0, stream=stream) # Prepare arguments - retr = [] # hold functors for writeback + retr = [] # hold functors for writeback kernelargs = [] for t, v in zip(self.argument_types, args): @@ -449,46 +492,51 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): stream_handle = stream and stream.handle or zero_stream # Invoke kernel - driver.launch_kernel(cufunc.handle, - *griddim, - *blockdim, - sharedmem, - stream_handle, - kernelargs, - cooperative=self.cooperative) + driver.launch_kernel( + cufunc.handle, + *griddim, + *blockdim, + sharedmem, + stream_handle, + kernelargs, + cooperative=self.cooperative, + ) if self.debug: driver.device_to_host(ctypes.addressof(excval), excmem, excsz) if excval.value != 0: # An error occurred def load_symbol(name): - mem, sz = cufunc.module.get_global_symbol("%s__%s__" % - (cufunc.name, - name)) + mem, sz = cufunc.module.get_global_symbol( + "%s__%s__" % (cufunc.name, name) + ) val = ctypes.c_int() driver.device_to_host(ctypes.addressof(val), mem, sz) return val.value - tid = [load_symbol("tid" + i) for i in 'zyx'] - ctaid = [load_symbol("ctaid" + i) for i in 'zyx'] + tid = [load_symbol("tid" + i) for i in "zyx"] + ctaid = [load_symbol("ctaid" + i) for i in "zyx"] code = excval.value exccls, exc_args, loc = self.call_helper.get_exception(code) # Prefix the exception message with the source location if loc is None: - locinfo = '' + locinfo = "" else: sym, filepath, lineno = loc filepath = 
os.path.abspath(filepath) - locinfo = 'In function %r, file %s, line %s, ' % (sym, - filepath, - lineno,) + locinfo = "In function %r, file %s, line %s, " % ( + sym, + filepath, + lineno, + ) # Prefix the exception message with the thread position prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid) if exc_args: - exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \ - exc_args[1:] + exc_args = ("%s: %s" % (prefix, exc_args[0]),) + exc_args[ + 1: + ] else: - exc_args = prefix, + exc_args = (prefix,) raise exccls(*exc_args) # retrieve auto converted arrays @@ -502,11 +550,7 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): # map the arguments using any extension you've registered for extension in reversed(self.extensions): - ty, val = extension.prepare_args( - ty, - val, - stream=stream, - retr=retr) + ty, val = extension.prepare_args(ty, val, stream=stream, retr=retr) if isinstance(ty, types.Array): devary = wrap_arg(val).to_device(retr, stream) @@ -592,8 +636,9 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: - raise ValueError("Can't create ForAll with negative task count: %s" - % ntasks) + raise ValueError( + "Can't create ForAll with negative task count: %s" % ntasks + ) self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb @@ -611,8 +656,9 @@ def __call__(self, *args): blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return specialized[griddim, blockdim, self.stream, - self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, self.sharedmem]( + *args + ) def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block @@ -627,7 +673,7 @@ def _compute_thread_per_block(self, dispatcher): kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( func=kernel._codelibrary.get_cufunc(), - b2d_func=0, # dynamic-shared memory 
is constant to blksz + b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, ) @@ -658,13 +704,16 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem): min_grid_size = 128 grid_size = griddim[0] * griddim[1] * griddim[2] if grid_size < min_grid_size: - msg = (f"Grid size {grid_size} will likely result in GPU " - "under-utilization due to low occupancy.") + msg = ( + f"Grid size {grid_size} will likely result in GPU " + "under-utilization due to low occupancy." + ) warn(NumbaPerformanceWarning(msg)) def __call__(self, *args): - return self.dispatcher.call(args, self.griddim, self.blockdim, - self.stream, self.sharedmem) + return self.dispatcher.call( + args, self.griddim, self.blockdim, self.stream, self.sharedmem + ) class CUDACacheImpl(CacheImpl): @@ -689,6 +738,7 @@ class CUDACache(Cache): """ Implements a cache that saves and loads CUDA kernels and compile results. """ + _impl_class = CUDACacheImpl def load_overload(self, sig, target_context): @@ -696,12 +746,13 @@ def load_overload(self, sig, target_context): # initialized. To initialize the correct (i.e. CUDA) target, we need to # enforce that the current target is the CUDA target. from numba.core.target_extension import target_override - with target_override('cuda'): + + with target_override("cuda"): return super().load_overload(sig, target_context) class CUDADispatcher(Dispatcher, serialize.ReduceMixin): - ''' + """ CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized version already exists) & compute capability, and launch on the device @@ -709,7 +760,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): Dispatcher objects are not to be constructed by the user, but instead are created using the :func:`numba.cuda.jit` decorator. - ''' + """ # Whether to fold named arguments and default values. 
Default values are # presently unsupported on CUDA, so we can leave this as False in all @@ -719,8 +770,9 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): - super().__init__(py_func, targetoptions=targetoptions, - pipeline_class=pipeline_class) + super().__init__( + py_func, targetoptions=targetoptions, pipeline_class=pipeline_class + ) # The following properties are for specialization of CUDADispatchers. A # specialized CUDADispatcher is one that is compiled for exactly one @@ -748,7 +800,7 @@ def configure(self, griddim, blockdim, stream=0, sharedmem=0): def __getitem__(self, args): if len(args) not in [2, 3, 4]: - raise ValueError('must specify at least the griddim and blockdim') + raise ValueError("must specify at least the griddim and blockdim") return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): @@ -775,7 +827,7 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): @property def extensions(self): - ''' + """ A list of objects that must have a `prepare_args` function. When a specialized kernel is called, each argument will be passed through to the `prepare_args` (from the last object in this list to the @@ -791,17 +843,17 @@ def extensions(self): will be passed in turn to the next right-most `extension`. After all the extensions have been called, the resulting `(ty, val)` will be passed into Numba's default argument marshalling logic. - ''' - return self.targetoptions.get('extensions') + """ + return self.targetoptions.get("extensions") def __call__(self, *args, **kwargs): # An attempt to launch an unconfigured kernel raise ValueError(missing_launch_config_msg) def call(self, args, griddim, blockdim, stream, sharedmem): - ''' + """ Compile if necessary and invoke this kernel with *args*. 
- ''' + """ if self.specialized: kernel = next(iter(self.overloads.values())) else: @@ -824,28 +876,30 @@ def typeof_pyval(self, val): if cuda.is_cuda_array(val): # When typing, we don't need to synchronize on the array's # stream - this is done when the kernel is launched. - return typeof(cuda.as_cuda_array(val, sync=False), - Purpose.argument) + return typeof( + cuda.as_cuda_array(val, sync=False), Purpose.argument + ) else: raise def specialize(self, *args): - ''' + """ Create a new instance of this dispatcher specialized for the given *args*. - ''' + """ cc = get_current_device().compute_capability argtypes = tuple(self.typeof_pyval(a) for a in args) if self.specialized: - raise RuntimeError('Dispatcher already specialized') + raise RuntimeError("Dispatcher already specialized") specialization = self.specializations.get((cc, argtypes)) if specialization: return specialization targetoptions = self.targetoptions - specialization = CUDADispatcher(self.py_func, - targetoptions=targetoptions) + specialization = CUDADispatcher( + self.py_func, targetoptions=targetoptions + ) specialization.compile(argtypes) specialization.disable_compile() specialization._specialized = True @@ -860,7 +914,7 @@ def specialized(self): return self._specialized def get_regs_per_thread(self, signature=None): - ''' + """ Returns the number of registers used by each thread in this kernel for the device in the current context. @@ -869,17 +923,19 @@ def get_regs_per_thread(self, signature=None): kernel. :return: The number of registers used by the compiled variant of the kernel for the given signature and current device. 
- ''' + """ if signature is not None: return self.overloads[signature.args].regs_per_thread if self.specialized: return next(iter(self.overloads.values())).regs_per_thread else: - return {sig: overload.regs_per_thread - for sig, overload in self.overloads.items()} + return { + sig: overload.regs_per_thread + for sig, overload in self.overloads.items() + } def get_const_mem_size(self, signature=None): - ''' + """ Returns the size in bytes of constant memory used by this kernel for the device in the current context. @@ -889,17 +945,19 @@ def get_const_mem_size(self, signature=None): :return: The size in bytes of constant memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].const_mem_size if self.specialized: return next(iter(self.overloads.values())).const_mem_size else: - return {sig: overload.const_mem_size - for sig, overload in self.overloads.items()} + return { + sig: overload.const_mem_size + for sig, overload in self.overloads.items() + } def get_shared_mem_per_block(self, signature=None): - ''' + """ Returns the size in bytes of statically allocated shared memory for this kernel. @@ -908,17 +966,19 @@ def get_shared_mem_per_block(self, signature=None): specialized kernel. :return: The amount of shared memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].shared_mem_per_block if self.specialized: return next(iter(self.overloads.values())).shared_mem_per_block else: - return {sig: overload.shared_mem_per_block - for sig, overload in self.overloads.items()} + return { + sig: overload.shared_mem_per_block + for sig, overload in self.overloads.items() + } def get_max_threads_per_block(self, signature=None): - ''' + """ Returns the maximum allowable number of threads per block for this kernel. 
Exceeding this threshold will result in the kernel failing to launch. @@ -929,17 +989,19 @@ def get_max_threads_per_block(self, signature=None): :return: The maximum allowable threads per block for the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].max_threads_per_block if self.specialized: return next(iter(self.overloads.values())).max_threads_per_block else: - return {sig: overload.max_threads_per_block - for sig, overload in self.overloads.items()} + return { + sig: overload.max_threads_per_block + for sig, overload in self.overloads.items() + } def get_local_mem_per_thread(self, signature=None): - ''' + """ Returns the size in bytes of local memory per thread for this kernel. @@ -948,14 +1010,16 @@ def get_local_mem_per_thread(self, signature=None): specialized kernel. :return: The amount of local memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].local_mem_per_thread if self.specialized: return next(iter(self.overloads.values())).local_mem_per_thread else: - return {sig: overload.local_mem_per_thread - for sig, overload in self.overloads.items()} + return { + sig: overload.local_mem_per_thread + for sig, overload in self.overloads.items() + } def get_call_template(self, args, kws): # Originally copied from _DispatcherBase.get_call_template. 
This @@ -983,7 +1047,8 @@ def get_call_template(self, args, kws): name = "CallTemplate({0})".format(func_name) call_template = typing.make_concrete_template( - name, key=func_name, signatures=self.nopython_signatures) + name, key=func_name, signatures=self.nopython_signatures + ) pysig = utils.pysignature(self.py_func) return call_template, pysig, args, kws @@ -998,33 +1063,36 @@ def compile_device(self, args, return_type=None): """ if args not in self.overloads: with self._compiling_counter: - - debug = self.targetoptions.get('debug') - lineinfo = self.targetoptions.get('lineinfo') - inline = self.targetoptions.get('inline') - fastmath = self.targetoptions.get('fastmath') + debug = self.targetoptions.get("debug") + lineinfo = self.targetoptions.get("lineinfo") + inline = self.targetoptions.get("inline") + fastmath = self.targetoptions.get("fastmath") nvvm_options = { - 'opt': 3 if self.targetoptions.get('opt') else 0, - 'fastmath': fastmath + "opt": 3 if self.targetoptions.get("opt") else 0, + "fastmath": fastmath, } if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None cc = get_current_device().compute_capability - cres = compile_cuda(self.py_func, return_type, args, - debug=debug, - lineinfo=lineinfo, - inline=inline, - fastmath=fastmath, - nvvm_options=nvvm_options, - cc=cc) + cres = compile_cuda( + self.py_func, + return_type, + args, + debug=debug, + lineinfo=lineinfo, + inline=inline, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + ) self.overloads[args] = cres - cres.target_context.insert_user_function(cres.entry_point, - cres.fndesc, - [cres.library]) + cres.target_context.insert_user_function( + cres.entry_point, cres.fndesc, [cres.library] + ) else: cres = self.overloads[args] @@ -1036,10 +1104,10 @@ def add_overload(self, kernel, argtypes): self.overloads[argtypes] = kernel def compile(self, sig): - ''' + """ Compile and bind to the current context a version of this kernel specialized for the given signature. 
- ''' + """ argtypes, return_type = sigutils.normalize_signature(sig) assert return_type is None or return_type == types.none @@ -1072,15 +1140,15 @@ def compile(self, sig): return kernel def inspect_llvm(self, signature=None): - ''' + """ Return the LLVM IR for this kernel. :param signature: A tuple of argument types. :return: The LLVM IR for the given signature, or a dict of LLVM IR for all previously-encountered signatures. - ''' - device = self.targetoptions.get('device') + """ + device = self.targetoptions.get("device") if signature is not None: if device: return self.overloads[signature].library.get_llvm_str() @@ -1088,23 +1156,27 @@ def inspect_llvm(self, signature=None): return self.overloads[signature].inspect_llvm() else: if device: - return {sig: overload.library.get_llvm_str() - for sig, overload in self.overloads.items()} + return { + sig: overload.library.get_llvm_str() + for sig, overload in self.overloads.items() + } else: - return {sig: overload.inspect_llvm() - for sig, overload in self.overloads.items()} + return { + sig: overload.inspect_llvm() + for sig, overload in self.overloads.items() + } def inspect_asm(self, signature=None): - ''' + """ Return this kernel's PTX assembly code for for the device in the current context. :param signature: A tuple of argument types. :return: The PTX code for the given signature, or a dict of PTX codes for all previously-encountered signatures. 
- ''' + """ cc = get_current_device().compute_capability - device = self.targetoptions.get('device') + device = self.targetoptions.get("device") if signature is not None: if device: return self.overloads[signature].library.get_asm_str(cc) @@ -1112,14 +1184,18 @@ def inspect_asm(self, signature=None): return self.overloads[signature].inspect_asm(cc) else: if device: - return {sig: overload.library.get_asm_str(cc) - for sig, overload in self.overloads.items()} + return { + sig: overload.library.get_asm_str(cc) + for sig, overload in self.overloads.items() + } else: - return {sig: overload.inspect_asm(cc) - for sig, overload in self.overloads.items()} + return { + sig: overload.inspect_asm(cc) + for sig, overload in self.overloads.items() + } def inspect_sass_cfg(self, signature=None): - ''' + """ Return this kernel's CFG for the device in the current context. :param signature: A tuple of argument types. @@ -1129,18 +1205,20 @@ def inspect_sass_cfg(self, signature=None): The CFG for the device in the current context is returned. Requires nvdisasm to be available on the PATH. - ''' - if self.targetoptions.get('device'): - raise RuntimeError('Cannot get the CFG of a device function') + """ + if self.targetoptions.get("device"): + raise RuntimeError("Cannot get the CFG of a device function") if signature is not None: return self.overloads[signature].inspect_sass_cfg() else: - return {sig: defn.inspect_sass_cfg() - for sig, defn in self.overloads.items()} + return { + sig: defn.inspect_sass_cfg() + for sig, defn in self.overloads.items() + } def inspect_sass(self, signature=None): - ''' + """ Return this kernel's SASS assembly code for for the device in the current context. @@ -1151,22 +1229,23 @@ def inspect_sass(self, signature=None): SASS for the device in the current context is returned. Requires nvdisasm to be available on the PATH. 
- ''' - if self.targetoptions.get('device'): - raise RuntimeError('Cannot inspect SASS of a device function') + """ + if self.targetoptions.get("device"): + raise RuntimeError("Cannot inspect SASS of a device function") if signature is not None: return self.overloads[signature].inspect_sass() else: - return {sig: defn.inspect_sass() - for sig, defn in self.overloads.items()} + return { + sig: defn.inspect_sass() for sig, defn in self.overloads.items() + } def inspect_types(self, file=None): - ''' + """ Produce a dump of the Python source of this function annotated with the corresponding Numba IR and type information. The dump is written to *file*, or *sys.stdout* if *file* is *None*. - ''' + """ if file is None: file = sys.stdout @@ -1186,5 +1265,4 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, - targetoptions=self.targetoptions) + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba_cuda/numba/cuda/errors.py b/numba_cuda/numba/cuda/errors.py index 653a0db6e..16989714e 100644 --- a/numba_cuda/numba/cuda/errors.py +++ b/numba_cuda/numba/cuda/errors.py @@ -7,8 +7,7 @@ def __init__(self, msg, tid=None, ctaid=None): self.tid = tid self.ctaid = ctaid self.msg = msg - t = ("An exception was raised in thread=%s block=%s\n" - "\t%s") + t = "An exception was raised in thread=%s block=%s\n\t%s" msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) @@ -17,8 +16,9 @@ class CudaLoweringError(LoweringError): pass -_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/" - "kernels.html#kernel-invocation") +_launch_help_url = ( + "https://numba.readthedocs.io/en/stable/cuda/kernels.html#kernel-invocation" +) missing_launch_config_msg = """ Kernel launch configuration was not specified. 
Use the syntax: @@ -40,12 +40,15 @@ def check_dim(dim, name): else: dim = list(dim) if len(dim) > 3: - raise ValueError('%s must be a sequence of 1, 2 or 3 integers, ' - 'got %r' % (name, dim)) + raise ValueError( + "%s must be a sequence of 1, 2 or 3 integers, " + "got %r" % (name, dim) + ) for v in dim: if not isinstance(v, numbers.Integral): - raise TypeError('%s must be a sequence of integers, got %r' - % (name, dim)) + raise TypeError( + "%s must be a sequence of integers, got %r" % (name, dim) + ) while len(dim) < 3: dim.append(1) return tuple(dim) @@ -53,7 +56,7 @@ def check_dim(dim, name): if None in (griddim, blockdim): raise ValueError(missing_launch_config_msg) - griddim = check_dim(griddim, 'griddim') - blockdim = check_dim(blockdim, 'blockdim') + griddim = check_dim(griddim, "griddim") + blockdim = check_dim(blockdim, "blockdim") return griddim, blockdim diff --git a/numba_cuda/numba/cuda/extending.py b/numba_cuda/numba/cuda/extending.py index cbc482aaa..a6b370523 100644 --- a/numba_cuda/numba/cuda/extending.py +++ b/numba_cuda/numba/cuda/extending.py @@ -4,4 +4,4 @@ from numba.core.extending import intrinsic as _intrinsic -intrinsic = _intrinsic(target='cuda') +intrinsic = _intrinsic(target="cuda") diff --git a/numba_cuda/numba/cuda/initialize.py b/numba_cuda/numba/cuda/initialize.py index e90c95b31..832891a66 100644 --- a/numba_cuda/numba/cuda/initialize.py +++ b/numba_cuda/numba/cuda/initialize.py @@ -4,9 +4,11 @@ def initialize_all(): from numba.cuda.decorators import jit from numba.cuda.dispatcher import CUDADispatcher - from numba.core.target_extension import (target_registry, - dispatcher_registry, - jit_registry) + from numba.core.target_extension import ( + target_registry, + dispatcher_registry, + jit_registry, + ) cuda_target = target_registry["cuda"] jit_registry[cuda_target] = jit diff --git a/numba_cuda/numba/cuda/intrinsic_wrapper.py b/numba_cuda/numba/cuda/intrinsic_wrapper.py index e02639f21..cfbdf06fe 100644 --- 
a/numba_cuda/numba/cuda/intrinsic_wrapper.py +++ b/numba_cuda/numba/cuda/intrinsic_wrapper.py @@ -45,7 +45,7 @@ def shfl_sync(mask, value, src_lane): from src_lane. If this is outside the warp, then the given value is returned. """ - return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0] @jit(device=True) @@ -65,7 +65,7 @@ def shfl_down_sync(mask, value, delta): from (laneid + delta). If this is outside the warp, then the given value is returned. """ - return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0] @jit(device=True) @@ -74,4 +74,4 @@ def shfl_xor_sync(mask, value, lane_mask): Shuffles value across the masked warp and returns the value from (laneid ^ lane_mask). """ - return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0] diff --git a/numba_cuda/numba/cuda/intrinsics.py b/numba_cuda/numba/cuda/intrinsics.py index f5b186e88..2691ee8eb 100644 --- a/numba_cuda/numba/cuda/intrinsics.py +++ b/numba_cuda/numba/cuda/intrinsics.py @@ -9,9 +9,10 @@ from numba.cuda.extending import intrinsic -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Grid functions + def _type_grid_function(ndim): val = ndim.literal_value if val == 1: @@ -19,14 +20,14 @@ def _type_grid_function(ndim): elif val in (2, 3): restype = types.UniTuple(types.int64, val) else: - raise ValueError('argument can only be 1, 2, 3') + raise ValueError("argument can only be 1, 2, 3") return signature(restype, types.int32) @intrinsic def grid(typingctx, ndim): - '''grid(ndim) + """grid(ndim) Return the absolute position of the current thread in the entire grid of blocks. 
*ndim* should correspond to the number of dimensions declared when @@ -39,7 +40,7 @@ def grid(typingctx, ndim): and is similar for the other two indices, but using the ``y`` and ``z`` attributes. - ''' + """ if not isinstance(ndim, types.IntegerLiteral): raise RequireLiteralValue(ndim) @@ -59,7 +60,7 @@ def codegen(context, builder, sig, args): @intrinsic def gridsize(typingctx, ndim): - '''gridsize(ndim) + """gridsize(ndim) Return the absolute size (or shape) in threads of the entire grid of blocks. *ndim* should correspond to the number of dimensions declared when @@ -72,7 +73,7 @@ def gridsize(typingctx, ndim): and is similar for the other two indices, but using the ``y`` and ``z`` attributes. - ''' + """ if not isinstance(ndim, types.IntegerLiteral): raise RequireLiteralValue(ndim) @@ -87,17 +88,17 @@ def _nthreads_for_dim(builder, dim): def codegen(context, builder, sig, args): restype = sig.return_type - nx = _nthreads_for_dim(builder, 'x') + nx = _nthreads_for_dim(builder, "x") if restype == types.int64: return nx elif isinstance(restype, types.UniTuple): - ny = _nthreads_for_dim(builder, 'y') + ny = _nthreads_for_dim(builder, "y") if restype.count == 2: return cgutils.pack_array(builder, (nx, ny)) elif restype.count == 3: - nz = _nthreads_for_dim(builder, 'z') + nz = _nthreads_for_dim(builder, "z") return cgutils.pack_array(builder, (nx, ny, nz)) return sig, codegen @@ -108,37 +109,40 @@ def _warpsize(typingctx): sig = signature(types.int32) def codegen(context, builder, sig, args): - return nvvmutils.call_sreg(builder, 'warpsize') + return nvvmutils.call_sreg(builder, "warpsize") return sig, codegen -@overload_attribute(types.Module(cuda), 'warpsize', target='cuda') +@overload_attribute(types.Module(cuda), "warpsize", target="cuda") def cuda_warpsize(mod): - ''' + """ The size of a warp. All architectures implemented to date have a warp size of 32. 
- ''' + """ + def get(mod): return _warpsize() + return get -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # syncthreads + @intrinsic def syncthreads(typingctx): - ''' + """ Synchronize all threads in the same thread block. This function implements the same pattern as barriers in traditional multi-threaded programming: this function waits until all threads in the block call it, at which point it returns control to all its callers. - ''' + """ sig = signature(types.none) def codegen(context, builder, sig, args): - fname = 'llvm.nvvm.barrier0' + fname = "llvm.nvvm.barrier0" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -164,40 +168,40 @@ def codegen(context, builder, sig, args): @intrinsic def syncthreads_count(typingctx, predicate): - ''' + """ syncthreads_count(predicate) An extension to numba.cuda.syncthreads where the return value is a count of the threads where predicate is true. - ''' - fname = 'llvm.nvvm.barrier0.popc' + """ + fname = "llvm.nvvm.barrier0.popc" return _syncthreads_predicate(typingctx, predicate, fname) @intrinsic def syncthreads_and(typingctx, predicate): - ''' + """ syncthreads_and(predicate) An extension to numba.cuda.syncthreads where 1 is returned if predicate is true for all threads or 0 otherwise. - ''' - fname = 'llvm.nvvm.barrier0.and' + """ + fname = "llvm.nvvm.barrier0.and" return _syncthreads_predicate(typingctx, predicate, fname) @intrinsic def syncthreads_or(typingctx, predicate): - ''' + """ syncthreads_or(predicate) An extension to numba.cuda.syncthreads where 1 is returned if predicate is true for any thread or 0 otherwise. 
- ''' - fname = 'llvm.nvvm.barrier0.or' + """ + fname = "llvm.nvvm.barrier0.or" return _syncthreads_predicate(typingctx, predicate, fname) -@overload_method(types.Integer, 'bit_count', target='cuda') +@overload_method(types.Integer, "bit_count", target="cuda") def integer_bit_count(i): return lambda i: cuda.popc(i) diff --git a/numba_cuda/numba/cuda/kernels/reduction.py b/numba_cuda/numba/cuda/kernels/reduction.py index f733935b6..52d362599 100644 --- a/numba_cuda/numba/cuda/kernels/reduction.py +++ b/numba_cuda/numba/cuda/kernels/reduction.py @@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype): from numba import cuda reduce_op = cuda.jit(device=True)(fn) - inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision + inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision max_blocksize = _NUMWARPS * _WARPSIZE @cuda.jit(device=True) @@ -86,8 +86,9 @@ def device_reduce_full_block(arr, partials, sm_partials): # warning: this is assuming 4 warps. # assert numwarps == 4 if tid < 2: - sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0], - sm_partials[tid + 2, 0]) + sm_partials[tid, 0] = reduce_op( + sm_partials[tid, 0], sm_partials[tid + 2, 0] + ) cuda.syncwarp() if tid == 0: partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0]) @@ -148,8 +149,9 @@ def gpu_reduce_block_strided(arr, partials, init, use_init): """ tid = cuda.threadIdx.x - sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size), - dtype=nbtype) + sm_partials = cuda.shared.array( + (_NUMWARPS, inner_sm_size), dtype=nbtype + ) if cuda.blockDim.x == max_blocksize: device_reduce_full_block(arr, partials, sm_partials) else: @@ -238,17 +240,15 @@ def __call__(self, arr, size=None, res=None, init=0, stream=0): if size_full: # kernel for the fully populated threadblocks - kernel[full_blockct, blocksize, stream](arr[:size_full], - partials[:full_blockct], - init, - True) + kernel[full_blockct, blocksize, stream]( + arr[:size_full], partials[:full_blockct], init, True + ) if size_partial: 
# kernel for partially populated threadblocks - kernel[1, size_partial, stream](arr[size_full:], - partials[full_blockct:], - init, - not full_blockct) + kernel[1, size_partial, stream]( + arr[size_full:], partials[full_blockct:], init, not full_blockct + ) if partials.size > 1: # finish up diff --git a/numba_cuda/numba/cuda/kernels/transpose.py b/numba_cuda/numba/cuda/kernels/transpose.py index b1df36e04..1a1af2b41 100644 --- a/numba_cuda/numba/cuda/kernels/transpose.py +++ b/numba_cuda/numba/cuda/kernels/transpose.py @@ -18,16 +18,14 @@ def transpose(a, b=None): """ # prefer `a`'s stream if - stream = getattr(a, 'stream', 0) + stream = getattr(a, "stream", 0) if not b: cols, rows = a.shape strides = a.dtype.itemsize * cols, a.dtype.itemsize b = cuda.cudadrv.devicearray.DeviceNDArray( - (rows, cols), - strides, - dtype=a.dtype, - stream=stream) + (rows, cols), strides, dtype=a.dtype, stream=stream + ) dt = nps.from_dtype(a.dtype) @@ -40,7 +38,6 @@ def transpose(a, b=None): @cuda.jit def kernel(input, output): - tile = cuda.shared.array(shape=tile_shape, dtype=dt) tx = cuda.threadIdx.x diff --git a/numba_cuda/numba/cuda/libdevice.py b/numba_cuda/numba/cuda/libdevice.py index 303ade74b..4a066b77b 100644 --- a/numba_cuda/numba/cuda/libdevice.py +++ b/numba_cuda/numba/cuda/libdevice.py @@ -5,7 +5,7 @@ def abs(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def acos(x): @@ -15,7 +15,7 @@ def acos(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def acosf(x): @@ -25,7 +25,7 @@ def acosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def acosh(x): @@ -35,7 +35,7 @@ def acosh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def acoshf(x): @@ -45,7 +45,7 @@ def acoshf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def asin(x): @@ -55,7 +55,7 @@ def asin(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def asinf(x): @@ -65,7 +65,7 @@ def asinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def asinh(x): @@ -75,7 +75,7 @@ def asinh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def asinhf(x): @@ -85,7 +85,7 @@ def asinhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def atan(x): @@ -95,7 +95,7 @@ def atan(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def atan2(x, y): @@ -107,7 +107,7 @@ def atan2(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def atan2f(x, y): @@ -119,7 +119,7 @@ def atan2f(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def atanf(x): @@ -129,7 +129,7 @@ def atanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def atanh(x): @@ -139,7 +139,7 @@ def atanh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def atanhf(x): @@ -149,7 +149,7 @@ def atanhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def brev(x): @@ -159,7 +159,7 @@ def brev(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def brevll(x): @@ -169,7 +169,7 @@ def brevll(x): :param x: Argument. :type x: int64 :rtype: int64 -""" + """ def byte_perm(x, y, z): @@ -183,7 +183,7 @@ def byte_perm(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def cbrt(x): @@ -193,7 +193,7 @@ def cbrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cbrtf(x): @@ -203,7 +203,7 @@ def cbrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ceil(x): @@ -213,7 +213,7 @@ def ceil(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def ceilf(x): @@ -223,7 +223,7 @@ def ceilf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def clz(x): @@ -233,7 +233,7 @@ def clz(x): :param x: Argument. 
:type x: int32 :rtype: int32 -""" + """ def clzll(x): @@ -243,7 +243,7 @@ def clzll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def copysign(x, y): @@ -255,7 +255,7 @@ def copysign(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def copysignf(x, y): @@ -267,7 +267,7 @@ def copysignf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def cos(x): @@ -277,7 +277,7 @@ def cos(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cosf(x): @@ -287,7 +287,7 @@ def cosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def cosh(x): @@ -297,7 +297,7 @@ def cosh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def coshf(x): @@ -307,7 +307,7 @@ def coshf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def cospi(x): @@ -317,7 +317,7 @@ def cospi(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cospif(x): @@ -327,7 +327,7 @@ def cospif(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def dadd_rd(x, y): @@ -339,7 +339,7 @@ def dadd_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_rn(x, y): @@ -351,7 +351,7 @@ def dadd_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_ru(x, y): @@ -363,7 +363,7 @@ def dadd_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_rz(x, y): @@ -375,7 +375,7 @@ def dadd_rz(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rd(x, y): @@ -387,7 +387,7 @@ def ddiv_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rn(x, y): @@ -399,7 +399,7 @@ def ddiv_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_ru(x, y): @@ -411,7 +411,7 @@ def ddiv_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rz(x, y): @@ -423,7 +423,7 @@ def ddiv_rz(x, y): :param y: Argument. 
:type y: float64 :rtype: float64 -""" + """ def dmul_rd(x, y): @@ -435,7 +435,7 @@ def dmul_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_rn(x, y): @@ -447,7 +447,7 @@ def dmul_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_ru(x, y): @@ -459,7 +459,7 @@ def dmul_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_rz(x, y): @@ -471,7 +471,7 @@ def dmul_rz(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def double2float_rd(d): @@ -481,7 +481,7 @@ def double2float_rd(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_rn(d): @@ -491,7 +491,7 @@ def double2float_rn(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_ru(d): @@ -501,7 +501,7 @@ def double2float_ru(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_rz(d): @@ -511,7 +511,7 @@ def double2float_rz(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2hiint(d): @@ -521,7 +521,7 @@ def double2hiint(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rd(d): @@ -531,7 +531,7 @@ def double2int_rd(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rn(d): @@ -541,7 +541,7 @@ def double2int_rn(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_ru(d): @@ -551,7 +551,7 @@ def double2int_ru(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rz(d): @@ -561,7 +561,7 @@ def double2int_rz(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2ll_rd(f): @@ -571,7 +571,7 @@ def double2ll_rd(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ll_rn(f): @@ -581,7 +581,7 @@ def double2ll_rn(f): :param f: Argument. 
:type f: float64 :rtype: int64 -""" + """ def double2ll_ru(f): @@ -591,7 +591,7 @@ def double2ll_ru(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ll_rz(f): @@ -601,7 +601,7 @@ def double2ll_rz(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2loint(d): @@ -611,7 +611,7 @@ def double2loint(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rd(d): @@ -621,7 +621,7 @@ def double2uint_rd(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rn(d): @@ -631,7 +631,7 @@ def double2uint_rn(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_ru(d): @@ -641,7 +641,7 @@ def double2uint_ru(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rz(d): @@ -651,7 +651,7 @@ def double2uint_rz(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2ull_rd(f): @@ -661,7 +661,7 @@ def double2ull_rd(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_rn(f): @@ -671,7 +671,7 @@ def double2ull_rn(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_ru(f): @@ -681,7 +681,7 @@ def double2ull_ru(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_rz(f): @@ -691,7 +691,7 @@ def double2ull_rz(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double_as_longlong(x): @@ -701,7 +701,7 @@ def double_as_longlong(x): :param x: Argument. :type x: float64 :rtype: int64 -""" + """ def drcp_rd(x): @@ -711,7 +711,7 @@ def drcp_rd(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_rn(x): @@ -721,7 +721,7 @@ def drcp_rn(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_ru(x): @@ -731,7 +731,7 @@ def drcp_ru(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_rz(x): @@ -741,7 +741,7 @@ def drcp_rz(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def dsqrt_rd(x): @@ -751,7 +751,7 @@ def dsqrt_rd(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_rn(x): @@ -761,7 +761,7 @@ def dsqrt_rn(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_ru(x): @@ -771,7 +771,7 @@ def dsqrt_ru(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_rz(x): @@ -781,7 +781,7 @@ def dsqrt_rz(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erf(x): @@ -791,7 +791,7 @@ def erf(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfc(x): @@ -801,7 +801,7 @@ def erfc(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcf(x): @@ -811,7 +811,7 @@ def erfcf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfcinv(x): @@ -821,7 +821,7 @@ def erfcinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcinvf(x): @@ -831,7 +831,7 @@ def erfcinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfcx(x): @@ -841,7 +841,7 @@ def erfcx(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcxf(x): @@ -851,7 +851,7 @@ def erfcxf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erff(x): @@ -861,7 +861,7 @@ def erff(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfinv(x): @@ -871,7 +871,7 @@ def erfinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfinvf(x): @@ -881,7 +881,7 @@ def erfinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def exp(x): @@ -891,7 +891,7 @@ def exp(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp10(x): @@ -901,7 +901,7 @@ def exp10(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp10f(x): @@ -911,7 +911,7 @@ def exp10f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def exp2(x): @@ -921,7 +921,7 @@ def exp2(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp2f(x): @@ -931,7 +931,7 @@ def exp2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def expf(x): @@ -941,7 +941,7 @@ def expf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def expm1(x): @@ -951,7 +951,7 @@ def expm1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def expm1f(x): @@ -961,7 +961,7 @@ def expm1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fabs(f): @@ -971,7 +971,7 @@ def fabs(f): :param f: Argument. :type f: float64 :rtype: float64 -""" + """ def fabsf(f): @@ -981,7 +981,7 @@ def fabsf(f): :param f: Argument. :type f: float32 :rtype: float32 -""" + """ def fadd_rd(x, y): @@ -993,7 +993,7 @@ def fadd_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_rn(x, y): @@ -1005,7 +1005,7 @@ def fadd_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_ru(x, y): @@ -1017,7 +1017,7 @@ def fadd_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_rz(x, y): @@ -1029,7 +1029,7 @@ def fadd_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_cosf(x): @@ -1039,7 +1039,7 @@ def fast_cosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_exp10f(x): @@ -1049,7 +1049,7 @@ def fast_exp10f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_expf(x): @@ -1059,7 +1059,7 @@ def fast_expf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_fdividef(x, y): @@ -1071,7 +1071,7 @@ def fast_fdividef(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_log10f(x): @@ -1081,7 +1081,7 @@ def fast_log10f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def fast_log2f(x): @@ -1091,7 +1091,7 @@ def fast_log2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_logf(x): @@ -1101,7 +1101,7 @@ def fast_logf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_powf(x, y): @@ -1113,7 +1113,7 @@ def fast_powf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_sincosf(x): @@ -1123,7 +1123,7 @@ def fast_sincosf(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def fast_sinf(x): @@ -1133,7 +1133,7 @@ def fast_sinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_tanf(x): @@ -1143,7 +1143,7 @@ def fast_tanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fdim(x, y): @@ -1155,7 +1155,7 @@ def fdim(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fdimf(x, y): @@ -1167,7 +1167,7 @@ def fdimf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rd(x, y): @@ -1179,7 +1179,7 @@ def fdiv_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rn(x, y): @@ -1191,7 +1191,7 @@ def fdiv_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_ru(x, y): @@ -1203,7 +1203,7 @@ def fdiv_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rz(x, y): @@ -1215,7 +1215,7 @@ def fdiv_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def ffs(x): @@ -1225,7 +1225,7 @@ def ffs(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def ffsll(x): @@ -1235,7 +1235,7 @@ def ffsll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def finitef(x): @@ -1245,7 +1245,7 @@ def finitef(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def float2half_rn(f): @@ -1255,7 +1255,7 @@ def float2half_rn(f): :param f: Argument. 
:type f: float32 :rtype: int16 -""" + """ def float2int_rd(x): @@ -1265,7 +1265,7 @@ def float2int_rd(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_rn(x): @@ -1275,7 +1275,7 @@ def float2int_rn(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_ru(x): @@ -1285,7 +1285,7 @@ def float2int_ru(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_rz(x): @@ -1295,7 +1295,7 @@ def float2int_rz(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2ll_rd(f): @@ -1305,7 +1305,7 @@ def float2ll_rd(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_rn(f): @@ -1315,7 +1315,7 @@ def float2ll_rn(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_ru(f): @@ -1325,7 +1325,7 @@ def float2ll_ru(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_rz(f): @@ -1335,7 +1335,7 @@ def float2ll_rz(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2uint_rd(x): @@ -1345,7 +1345,7 @@ def float2uint_rd(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_rn(x): @@ -1355,7 +1355,7 @@ def float2uint_rn(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_ru(x): @@ -1365,7 +1365,7 @@ def float2uint_ru(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_rz(x): @@ -1375,7 +1375,7 @@ def float2uint_rz(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2ull_rd(f): @@ -1385,7 +1385,7 @@ def float2ull_rd(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ull_rn(f): @@ -1395,7 +1395,7 @@ def float2ull_rn(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ull_ru(f): @@ -1405,7 +1405,7 @@ def float2ull_ru(f): :param f: Argument. 
:type f: float32 :rtype: int64 -""" + """ def float2ull_rz(f): @@ -1415,7 +1415,7 @@ def float2ull_rz(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float_as_int(x): @@ -1425,7 +1425,7 @@ def float_as_int(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def floor(f): @@ -1435,7 +1435,7 @@ def floor(f): :param f: Argument. :type f: float64 :rtype: float64 -""" + """ def floorf(f): @@ -1445,7 +1445,7 @@ def floorf(f): :param f: Argument. :type f: float32 :rtype: float32 -""" + """ def fma(x, y, z): @@ -1459,7 +1459,7 @@ def fma(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rd(x, y, z): @@ -1473,7 +1473,7 @@ def fma_rd(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rn(x, y, z): @@ -1487,7 +1487,7 @@ def fma_rn(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_ru(x, y, z): @@ -1501,7 +1501,7 @@ def fma_ru(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rz(x, y, z): @@ -1515,7 +1515,7 @@ def fma_rz(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fmaf(x, y, z): @@ -1529,7 +1529,7 @@ def fmaf(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rd(x, y, z): @@ -1543,7 +1543,7 @@ def fmaf_rd(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rn(x, y, z): @@ -1557,7 +1557,7 @@ def fmaf_rn(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_ru(x, y, z): @@ -1571,7 +1571,7 @@ def fmaf_ru(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rz(x, y, z): @@ -1585,7 +1585,7 @@ def fmaf_rz(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmax(x, y): @@ -1597,7 +1597,7 @@ def fmax(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fmaxf(x, y): @@ -1609,7 +1609,7 @@ def fmaxf(x, y): :param y: Argument. 
:type y: float32 :rtype: float32 -""" + """ def fmin(x, y): @@ -1621,7 +1621,7 @@ def fmin(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fminf(x, y): @@ -1633,7 +1633,7 @@ def fminf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmod(x, y): @@ -1645,7 +1645,7 @@ def fmod(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fmodf(x, y): @@ -1657,7 +1657,7 @@ def fmodf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rd(x, y): @@ -1669,7 +1669,7 @@ def fmul_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rn(x, y): @@ -1681,7 +1681,7 @@ def fmul_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_ru(x, y): @@ -1693,7 +1693,7 @@ def fmul_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rz(x, y): @@ -1705,7 +1705,7 @@ def fmul_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def frcp_rd(x): @@ -1715,7 +1715,7 @@ def frcp_rd(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_rn(x): @@ -1725,7 +1725,7 @@ def frcp_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_ru(x): @@ -1735,7 +1735,7 @@ def frcp_ru(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_rz(x): @@ -1745,7 +1745,7 @@ def frcp_rz(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frexp(x): @@ -1755,7 +1755,7 @@ def frexp(x): :param x: Argument. :type x: float64 :rtype: Tuple(float64, int32) -""" + """ def frexpf(x): @@ -1765,7 +1765,7 @@ def frexpf(x): :param x: Argument. :type x: float32 :rtype: Tuple(float32, int32) -""" + """ def frsqrt_rn(x): @@ -1775,7 +1775,7 @@ def frsqrt_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_rd(x): @@ -1785,7 +1785,7 @@ def fsqrt_rd(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def fsqrt_rn(x): @@ -1795,7 +1795,7 @@ def fsqrt_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_ru(x): @@ -1805,7 +1805,7 @@ def fsqrt_ru(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_rz(x): @@ -1815,7 +1815,7 @@ def fsqrt_rz(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsub_rd(x, y): @@ -1827,7 +1827,7 @@ def fsub_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_rn(x, y): @@ -1839,7 +1839,7 @@ def fsub_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_ru(x, y): @@ -1851,7 +1851,7 @@ def fsub_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_rz(x, y): @@ -1863,7 +1863,7 @@ def fsub_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def hadd(x, y): @@ -1875,7 +1875,7 @@ def hadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def half2float(h): @@ -1885,7 +1885,7 @@ def half2float(h): :param h: Argument. :type h: int16 :rtype: float32 -""" + """ def hiloint2double(x, y): @@ -1897,7 +1897,7 @@ def hiloint2double(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def hypot(x, y): @@ -1909,7 +1909,7 @@ def hypot(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def hypotf(x, y): @@ -1921,7 +1921,7 @@ def hypotf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def ilogb(x): @@ -1931,7 +1931,7 @@ def ilogb(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def ilogbf(x): @@ -1941,7 +1941,7 @@ def ilogbf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def int2double_rn(i): @@ -1951,7 +1951,7 @@ def int2double_rn(i): :param i: Argument. :type i: int32 :rtype: float64 -""" + """ def int2float_rd(x): @@ -1961,7 +1961,7 @@ def int2float_rd(x): :param in: Argument. 
:type in: int32 :rtype: float32 -""" + """ def int2float_rn(x): @@ -1971,7 +1971,7 @@ def int2float_rn(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int2float_ru(x): @@ -1981,7 +1981,7 @@ def int2float_ru(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int2float_rz(x): @@ -1991,7 +1991,7 @@ def int2float_rz(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int_as_float(x): @@ -2001,7 +2001,7 @@ def int_as_float(x): :param x: Argument. :type x: int32 :rtype: float32 -""" + """ def isfinited(x): @@ -2011,7 +2011,7 @@ def isfinited(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isinfd(x): @@ -2021,7 +2021,7 @@ def isinfd(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isinff(x): @@ -2031,7 +2031,7 @@ def isinff(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def isnand(x): @@ -2041,7 +2041,7 @@ def isnand(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isnanf(x): @@ -2051,7 +2051,7 @@ def isnanf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def j0(x): @@ -2061,7 +2061,7 @@ def j0(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def j0f(x): @@ -2071,7 +2071,7 @@ def j0f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def j1(x): @@ -2081,7 +2081,7 @@ def j1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def j1f(x): @@ -2091,7 +2091,7 @@ def j1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def jn(n, x): @@ -2103,7 +2103,7 @@ def jn(n, x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def jnf(n, x): @@ -2115,7 +2115,7 @@ def jnf(n, x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ldexp(x, y): @@ -2127,7 +2127,7 @@ def ldexp(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def ldexpf(x, y): @@ -2139,7 +2139,7 @@ def ldexpf(x, y): :param y: Argument. 
:type y: int32 :rtype: float32 -""" + """ def lgamma(x): @@ -2149,7 +2149,7 @@ def lgamma(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def lgammaf(x): @@ -2159,7 +2159,7 @@ def lgammaf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ll2double_rd(l): @@ -2169,7 +2169,7 @@ def ll2double_rd(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_rn(l): @@ -2179,7 +2179,7 @@ def ll2double_rn(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_ru(l): @@ -2189,7 +2189,7 @@ def ll2double_ru(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_rz(l): @@ -2199,7 +2199,7 @@ def ll2double_rz(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2float_rd(l): @@ -2209,7 +2209,7 @@ def ll2float_rd(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_rn(l): @@ -2219,7 +2219,7 @@ def ll2float_rn(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_ru(l): @@ -2229,7 +2229,7 @@ def ll2float_ru(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_rz(l): @@ -2239,7 +2239,7 @@ def ll2float_rz(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def llabs(x): @@ -2249,7 +2249,7 @@ def llabs(x): :param x: Argument. :type x: int64 :rtype: int64 -""" + """ def llmax(x, y): @@ -2261,7 +2261,7 @@ def llmax(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def llmin(x, y): @@ -2273,7 +2273,7 @@ def llmin(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def llrint(x): @@ -2283,7 +2283,7 @@ def llrint(x): :param x: Argument. :type x: float64 :rtype: int64 -""" + """ def llrintf(x): @@ -2293,7 +2293,7 @@ def llrintf(x): :param x: Argument. :type x: float32 :rtype: int64 -""" + """ def llround(x): @@ -2303,7 +2303,7 @@ def llround(x): :param x: Argument. 
:type x: float64 :rtype: int64 -""" + """ def llroundf(x): @@ -2313,7 +2313,7 @@ def llroundf(x): :param x: Argument. :type x: float32 :rtype: int64 -""" + """ def log(x): @@ -2323,7 +2323,7 @@ def log(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log10(x): @@ -2333,7 +2333,7 @@ def log10(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log10f(x): @@ -2343,7 +2343,7 @@ def log10f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def log1p(x): @@ -2353,7 +2353,7 @@ def log1p(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log1pf(x): @@ -2363,7 +2363,7 @@ def log1pf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def log2(x): @@ -2373,7 +2373,7 @@ def log2(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log2f(x): @@ -2383,7 +2383,7 @@ def log2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def logb(x): @@ -2393,7 +2393,7 @@ def logb(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def logbf(x): @@ -2403,7 +2403,7 @@ def logbf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def logf(x): @@ -2413,7 +2413,7 @@ def logf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def longlong_as_double(x): @@ -2423,7 +2423,7 @@ def longlong_as_double(x): :param x: Argument. :type x: int64 :rtype: float64 -""" + """ def max(x, y): @@ -2435,7 +2435,7 @@ def max(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def min(x, y): @@ -2447,7 +2447,7 @@ def min(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def modf(x): @@ -2457,7 +2457,7 @@ def modf(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def modff(x): @@ -2467,7 +2467,7 @@ def modff(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def mul24(x, y): @@ -2479,7 +2479,7 @@ def mul24(x, y): :param y: Argument. 
:type y: int32 :rtype: int32 -""" + """ def mul64hi(x, y): @@ -2491,7 +2491,7 @@ def mul64hi(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def mulhi(x, y): @@ -2503,7 +2503,7 @@ def mulhi(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def nearbyint(x): @@ -2513,7 +2513,7 @@ def nearbyint(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def nearbyintf(x): @@ -2523,7 +2523,7 @@ def nearbyintf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def nextafter(x, y): @@ -2535,7 +2535,7 @@ def nextafter(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def nextafterf(x, y): @@ -2547,7 +2547,7 @@ def nextafterf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def normcdf(x): @@ -2557,7 +2557,7 @@ def normcdf(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def normcdff(x): @@ -2567,7 +2567,7 @@ def normcdff(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def normcdfinv(x): @@ -2577,7 +2577,7 @@ def normcdfinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def normcdfinvf(x): @@ -2587,7 +2587,7 @@ def normcdfinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def popc(x): @@ -2597,7 +2597,7 @@ def popc(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def popcll(x): @@ -2607,7 +2607,7 @@ def popcll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def pow(x, y): @@ -2619,7 +2619,7 @@ def pow(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def powf(x, y): @@ -2631,7 +2631,7 @@ def powf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def powi(x, y): @@ -2643,7 +2643,7 @@ def powi(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def powif(x, y): @@ -2655,7 +2655,7 @@ def powif(x, y): :param y: Argument. 
:type y: int32 :rtype: float32 -""" + """ def rcbrt(x): @@ -2665,7 +2665,7 @@ def rcbrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rcbrtf(x): @@ -2675,7 +2675,7 @@ def rcbrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def remainder(x, y): @@ -2687,7 +2687,7 @@ def remainder(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def remainderf(x, y): @@ -2699,7 +2699,7 @@ def remainderf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def remquo(x, y): @@ -2711,7 +2711,7 @@ def remquo(x, y): :param y: Argument. :type y: float64 :rtype: Tuple(float64, int32) -""" + """ def remquof(x, y): @@ -2723,7 +2723,7 @@ def remquof(x, y): :param y: Argument. :type y: float32 :rtype: Tuple(float32, int32) -""" + """ def rhadd(x, y): @@ -2735,7 +2735,7 @@ def rhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def rint(x): @@ -2745,7 +2745,7 @@ def rint(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rintf(x): @@ -2755,7 +2755,7 @@ def rintf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def round(x): @@ -2765,7 +2765,7 @@ def round(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def roundf(x): @@ -2775,7 +2775,7 @@ def roundf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def rsqrt(x): @@ -2785,7 +2785,7 @@ def rsqrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rsqrtf(x): @@ -2795,7 +2795,7 @@ def rsqrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sad(x, y, z): @@ -2809,7 +2809,7 @@ def sad(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def saturatef(x): @@ -2819,7 +2819,7 @@ def saturatef(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def scalbn(x, y): @@ -2831,7 +2831,7 @@ def scalbn(x, y): :param y: Argument. 
:type y: int32 :rtype: float64 -""" + """ def scalbnf(x, y): @@ -2843,7 +2843,7 @@ def scalbnf(x, y): :param y: Argument. :type y: int32 :rtype: float32 -""" + """ def signbitd(x): @@ -2853,7 +2853,7 @@ def signbitd(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def signbitf(x): @@ -2863,7 +2863,7 @@ def signbitf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def sin(x): @@ -2873,7 +2873,7 @@ def sin(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sincos(x): @@ -2883,7 +2883,7 @@ def sincos(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def sincosf(x): @@ -2893,7 +2893,7 @@ def sincosf(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def sincospi(x): @@ -2903,7 +2903,7 @@ def sincospi(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def sincospif(x): @@ -2913,7 +2913,7 @@ def sincospif(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def sinf(x): @@ -2923,7 +2923,7 @@ def sinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sinh(x): @@ -2933,7 +2933,7 @@ def sinh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sinhf(x): @@ -2943,7 +2943,7 @@ def sinhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sinpi(x): @@ -2953,7 +2953,7 @@ def sinpi(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sinpif(x): @@ -2963,7 +2963,7 @@ def sinpif(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sqrt(x): @@ -2973,7 +2973,7 @@ def sqrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sqrtf(x): @@ -2983,7 +2983,7 @@ def sqrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tan(x): @@ -2993,7 +2993,7 @@ def tan(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def tanf(x): @@ -3003,7 +3003,7 @@ def tanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tanh(x): @@ -3013,7 +3013,7 @@ def tanh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def tanhf(x): @@ -3023,7 +3023,7 @@ def tanhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tgamma(x): @@ -3033,7 +3033,7 @@ def tgamma(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def tgammaf(x): @@ -3043,7 +3043,7 @@ def tgammaf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def trunc(x): @@ -3053,7 +3053,7 @@ def trunc(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def truncf(x): @@ -3063,7 +3063,7 @@ def truncf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def uhadd(x, y): @@ -3075,7 +3075,7 @@ def uhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def uint2double_rn(i): @@ -3085,7 +3085,7 @@ def uint2double_rn(i): :param i: Argument. :type i: int32 :rtype: float64 -""" + """ def uint2float_rd(x): @@ -3095,7 +3095,7 @@ def uint2float_rd(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_rn(x): @@ -3105,7 +3105,7 @@ def uint2float_rn(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_ru(x): @@ -3115,7 +3115,7 @@ def uint2float_ru(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_rz(x): @@ -3125,7 +3125,7 @@ def uint2float_rz(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def ull2double_rd(l): @@ -3135,7 +3135,7 @@ def ull2double_rd(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2double_rn(l): @@ -3145,7 +3145,7 @@ def ull2double_rn(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2double_ru(l): @@ -3155,7 +3155,7 @@ def ull2double_ru(l): :param l: Argument. 
:type l: int64 :rtype: float64 -""" + """ def ull2double_rz(l): @@ -3165,7 +3165,7 @@ def ull2double_rz(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2float_rd(l): @@ -3175,7 +3175,7 @@ def ull2float_rd(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_rn(l): @@ -3185,7 +3185,7 @@ def ull2float_rn(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_ru(l): @@ -3195,7 +3195,7 @@ def ull2float_ru(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_rz(l): @@ -3205,7 +3205,7 @@ def ull2float_rz(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ullmax(x, y): @@ -3217,7 +3217,7 @@ def ullmax(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def ullmin(x, y): @@ -3229,7 +3229,7 @@ def ullmin(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def umax(x, y): @@ -3241,7 +3241,7 @@ def umax(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umin(x, y): @@ -3253,7 +3253,7 @@ def umin(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umul24(x, y): @@ -3265,7 +3265,7 @@ def umul24(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umul64hi(x, y): @@ -3277,7 +3277,7 @@ def umul64hi(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def umulhi(x, y): @@ -3289,7 +3289,7 @@ def umulhi(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def urhadd(x, y): @@ -3301,7 +3301,7 @@ def urhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def usad(x, y, z): @@ -3315,7 +3315,7 @@ def usad(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def y0(x): @@ -3325,7 +3325,7 @@ def y0(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def y0f(x): @@ -3335,7 +3335,7 @@ def y0f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def y1(x): @@ -3345,7 +3345,7 @@ def y1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def y1f(x): @@ -3355,7 +3355,7 @@ def y1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def yn(n, x): @@ -3367,7 +3367,7 @@ def yn(n, x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def ynf(n, x): @@ -3379,4 +3379,4 @@ def ynf(n, x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ diff --git a/numba_cuda/numba/cuda/libdeviceimpl.py b/numba_cuda/numba/cuda/libdeviceimpl.py index 4bb2e905e..827948954 100644 --- a/numba_cuda/numba/cuda/libdeviceimpl.py +++ b/numba_cuda/numba/cuda/libdeviceimpl.py @@ -49,8 +49,9 @@ def core(context, builder, sig, args): for arg in prototype_args: if arg.is_ptr: # Allocate space for return value and add to args - tmp_arg = cgutils.alloca_once(builder, - context.get_value_type(arg.ty)) + tmp_arg = cgutils.alloca_once( + builder, context.get_value_type(arg.ty) + ) actual_args.append(tmp_arg) virtual_args.append(tmp_arg) else: diff --git a/numba_cuda/numba/cuda/mathimpl.py b/numba_cuda/numba/cuda/mathimpl.py index c22deb564..9b1e1f767 100644 --- a/numba_cuda/numba/cuda/mathimpl.py +++ b/numba_cuda/numba/cuda/mathimpl.py @@ -12,57 +12,57 @@ booleans = [] -booleans += [('isnand', 'isnanf', math.isnan)] -booleans += [('isinfd', 'isinff', math.isinf)] -booleans += [('isfinited', 'finitef', math.isfinite)] +booleans += [("isnand", "isnanf", math.isnan)] +booleans += [("isinfd", "isinff", math.isinf)] +booleans += [("isfinited", "finitef", math.isfinite)] unarys = [] -unarys += [('ceil', 'ceilf', math.ceil)] -unarys += [('floor', 'floorf', math.floor)] -unarys += [('fabs', 'fabsf', math.fabs)] -unarys += [('exp', 'expf', math.exp)] -unarys += [('expm1', 'expm1f', math.expm1)] -unarys += [('erf', 'erff', math.erf)] -unarys += [('erfc', 'erfcf', math.erfc)] -unarys += [('tgamma', 'tgammaf', math.gamma)] -unarys += [('lgamma', 'lgammaf', 
math.lgamma)] -unarys += [('sqrt', 'sqrtf', math.sqrt)] -unarys += [('log', 'logf', math.log)] -unarys += [('log2', 'log2f', math.log2)] -unarys += [('log10', 'log10f', math.log10)] -unarys += [('log1p', 'log1pf', math.log1p)] -unarys += [('acosh', 'acoshf', math.acosh)] -unarys += [('acos', 'acosf', math.acos)] -unarys += [('cos', 'cosf', math.cos)] -unarys += [('cosh', 'coshf', math.cosh)] -unarys += [('asinh', 'asinhf', math.asinh)] -unarys += [('asin', 'asinf', math.asin)] -unarys += [('sin', 'sinf', math.sin)] -unarys += [('sinh', 'sinhf', math.sinh)] -unarys += [('atan', 'atanf', math.atan)] -unarys += [('atanh', 'atanhf', math.atanh)] -unarys += [('tan', 'tanf', math.tan)] -unarys += [('trunc', 'truncf', math.trunc)] +unarys += [("ceil", "ceilf", math.ceil)] +unarys += [("floor", "floorf", math.floor)] +unarys += [("fabs", "fabsf", math.fabs)] +unarys += [("exp", "expf", math.exp)] +unarys += [("expm1", "expm1f", math.expm1)] +unarys += [("erf", "erff", math.erf)] +unarys += [("erfc", "erfcf", math.erfc)] +unarys += [("tgamma", "tgammaf", math.gamma)] +unarys += [("lgamma", "lgammaf", math.lgamma)] +unarys += [("sqrt", "sqrtf", math.sqrt)] +unarys += [("log", "logf", math.log)] +unarys += [("log2", "log2f", math.log2)] +unarys += [("log10", "log10f", math.log10)] +unarys += [("log1p", "log1pf", math.log1p)] +unarys += [("acosh", "acoshf", math.acosh)] +unarys += [("acos", "acosf", math.acos)] +unarys += [("cos", "cosf", math.cos)] +unarys += [("cosh", "coshf", math.cosh)] +unarys += [("asinh", "asinhf", math.asinh)] +unarys += [("asin", "asinf", math.asin)] +unarys += [("sin", "sinf", math.sin)] +unarys += [("sinh", "sinhf", math.sinh)] +unarys += [("atan", "atanf", math.atan)] +unarys += [("atanh", "atanhf", math.atanh)] +unarys += [("tan", "tanf", math.tan)] +unarys += [("trunc", "truncf", math.trunc)] unarys_fastmath = {} -unarys_fastmath['cosf'] = 'fast_cosf' -unarys_fastmath['sinf'] = 'fast_sinf' -unarys_fastmath['tanf'] = 'fast_tanf' 
-unarys_fastmath['expf'] = 'fast_expf' -unarys_fastmath['log2f'] = 'fast_log2f' -unarys_fastmath['log10f'] = 'fast_log10f' -unarys_fastmath['logf'] = 'fast_logf' +unarys_fastmath["cosf"] = "fast_cosf" +unarys_fastmath["sinf"] = "fast_sinf" +unarys_fastmath["tanf"] = "fast_tanf" +unarys_fastmath["expf"] = "fast_expf" +unarys_fastmath["log2f"] = "fast_log2f" +unarys_fastmath["log10f"] = "fast_log10f" +unarys_fastmath["logf"] = "fast_logf" binarys = [] -binarys += [('copysign', 'copysignf', math.copysign)] -binarys += [('atan2', 'atan2f', math.atan2)] -binarys += [('pow', 'powf', math.pow)] -binarys += [('fmod', 'fmodf', math.fmod)] -binarys += [('hypot', 'hypotf', math.hypot)] -binarys += [('remainder', 'remainderf', math.remainder)] +binarys += [("copysign", "copysignf", math.copysign)] +binarys += [("atan2", "atan2f", math.atan2)] +binarys += [("pow", "powf", math.pow)] +binarys += [("fmod", "fmodf", math.fmod)] +binarys += [("hypot", "hypotf", math.hypot)] +binarys += [("remainder", "remainderf", math.remainder)] binarys_fastmath = {} -binarys_fastmath['powf'] = 'fast_powf' +binarys_fastmath["powf"] = "fast_powf" @lower(math.isinf, types.Integer) @@ -179,8 +179,9 @@ def fp16_trunc(x): def impl_boolean(key, ty, libfunc): def lower_boolean_impl(context, builder, sig, args): - libfunc_impl = context.get_function(libfunc, - typing.signature(types.int32, ty)) + libfunc_impl = context.get_function( + libfunc, typing.signature(types.int32, ty) + ) result = libfunc_impl(builder, args) return context.cast(builder, result, types.int32, types.boolean) @@ -197,9 +198,11 @@ def lower_unary_impl(context, builder, sig, args): if fast_replacement is not None: actual_libfunc = getattr(libdevice, fast_replacement) - libfunc_impl = context.get_function(actual_libfunc, - typing.signature(ty, ty)) + libfunc_impl = context.get_function( + actual_libfunc, typing.signature(ty, ty) + ) return libfunc_impl(builder, args) + return lower_unary_impl @@ -208,7 +211,7 @@ def 
get_unary_impl_for_fn_and_ty(fn, ty): # unary implementations, it does not appear in the unarys list. However, # its implementation can be looked up by key like the other # implementations, so we add it to the list we search here. - tanh_impls = ('tanh', 'tanhf', math.tanh) + tanh_impls = ("tanh", "tanhf", math.tanh) for fname64, fname32, key in unarys + [tanh_impls]: if fn == key: if ty == float32: @@ -233,7 +236,7 @@ def lower_unary_int_impl(context, builder, sig, args): elif sig.args[0] == uint64: convert = builder.uitofp else: - m = 'Only 64-bit integers are supported for generic unary int ops' + m = "Only 64-bit integers are supported for generic unary int ops" raise TypeError(m) arg = convert(args[0], ir.DoubleType()) @@ -254,9 +257,11 @@ def lower_binary_impl(context, builder, sig, args): if fast_replacement is not None: actual_libfunc = getattr(libdevice, fast_replacement) - libfunc_impl = context.get_function(actual_libfunc, - typing.signature(ty, ty, ty)) + libfunc_impl = context.get_function( + actual_libfunc, typing.signature(ty, ty, ty) + ) return libfunc_impl(builder, args) + return lower_binary_impl @@ -285,7 +290,7 @@ def lower_binary_int_impl(context, builder, sig, args): elif sig.args[0] == uint64: convert = builder.uitofp else: - m = 'Only 64-bit integers are supported for generic binary int ops' + m = "Only 64-bit integers are supported for generic binary int ops" raise TypeError(m) args = [convert(arg, ir.DoubleType()) for arg in args] @@ -390,12 +395,12 @@ def tanh_impl_libdevice(): def tanhf_impl_fastmath(): fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()]) - asm = ir.InlineAsm(fnty, 'tanh.approx.f32 $0, $1;', '=f,f') + asm = ir.InlineAsm(fnty, "tanh.approx.f32 $0, $1;", "=f,f") return builder.call(asm, args) if ty == float32 and context.fastmath: cc = get_compute_capability() - if cc >= (7,5): + if cc >= (7, 5): return tanhf_impl_fastmath() return tanh_impl_libdevice() @@ -420,7 +425,6 @@ def tanhf_impl_fastmath(): def 
cpow_implement(fty, cty): def core(context, builder, sig, args): def cpow_internal(a, b): - if b.real == fty(0.0) and b.imag == fty(0.0): return cty(1.0) + cty(0.0j) elif a.real == fty(0.0) and b.real == fty(0.0): @@ -434,8 +438,9 @@ def cpow_internal(a, b): len /= math.exp(at * b.imag) phase += b.imag * math.log(vabs) - return len * (cty(math.cos(phase)) + - cty(math.sin(phase) * cty(1.0j))) + return len * ( + cty(math.cos(phase)) + cty(math.sin(phase) * cty(1.0j)) + ) return context.compile_internal(builder, cpow_internal, sig, args) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 21d115125..f9735d7fc 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -16,11 +16,7 @@ @register_model(Dim3) class Dim3Model(models.StructModel): def __init__(self, dmm, fe_type): - members = [ - ('x', types.int32), - ('y', types.int32), - ('z', types.int32) - ] + members = [("x", types.int32), ("y", types.int32), ("z", types.int32)] super().__init__(dmm, fe_type, members) diff --git a/numba_cuda/numba/cuda/nvvmutils.py b/numba_cuda/numba/cuda/nvvmutils.py index 9a7dcde02..1b4fa1c33 100644 --- a/numba_cuda/numba/cuda/nvvmutils.py +++ b/numba_cuda/numba/cuda/nvvmutils.py @@ -5,159 +5,178 @@ def declare_atomic_cas_int(lmod, isize): - fname = '___numba_atomic_i' + str(isize) + '_cas_hack' - fnty = ir.FunctionType(ir.IntType(isize), - (ir.PointerType(ir.IntType(isize)), - ir.IntType(isize), - ir.IntType(isize))) + fname = "___numba_atomic_i" + str(isize) + "_cas_hack" + fnty = ir.FunctionType( + ir.IntType(isize), + ( + ir.PointerType(ir.IntType(isize)), + ir.IntType(isize), + ir.IntType(isize), + ), + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def atomic_cmpxchg(builder, lmod, isize, ptr, cmp, val): - out = builder.cmpxchg(ptr, cmp, val, 'monotonic', 'monotonic') + out = builder.cmpxchg(ptr, cmp, val, "monotonic", "monotonic") return builder.extract_value(out, 0) def declare_atomic_add_float32(lmod): - 
fname = 'llvm.nvvm.atomic.load.add.f32.p0f32' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType(), 0), ir.FloatType())) + fname = "llvm.nvvm.atomic.load.add.f32.p0f32" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType(), 0), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_add_float64(lmod): flags = targetconfig.ConfigStack().top() if flags.compute_capability >= (6, 0): - fname = 'llvm.nvvm.atomic.load.add.f64.p0f64' + fname = "llvm.nvvm.atomic.load.add.f64.p0f64" else: - fname = '___numba_atomic_double_add' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_add" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_sub_float32(lmod): - fname = '___numba_atomic_float_sub' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_sub" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_sub_float64(lmod): - fname = '___numba_atomic_double_sub' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_sub" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_inc_int32(lmod): - fname = 'llvm.nvvm.atomic.load.inc.32.p0i32' - fnty = ir.FunctionType(ir.IntType(32), - (ir.PointerType(ir.IntType(32)), ir.IntType(32))) + fname = "llvm.nvvm.atomic.load.inc.32.p0i32" + fnty = ir.FunctionType( + ir.IntType(32), (ir.PointerType(ir.IntType(32)), ir.IntType(32)) + ) return cgutils.get_or_insert_function(lmod, fnty, 
fname) def declare_atomic_inc_int64(lmod): - fname = '___numba_atomic_u64_inc' - fnty = ir.FunctionType(ir.IntType(64), - (ir.PointerType(ir.IntType(64)), ir.IntType(64))) + fname = "___numba_atomic_u64_inc" + fnty = ir.FunctionType( + ir.IntType(64), (ir.PointerType(ir.IntType(64)), ir.IntType(64)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_dec_int32(lmod): - fname = 'llvm.nvvm.atomic.load.dec.32.p0i32' - fnty = ir.FunctionType(ir.IntType(32), - (ir.PointerType(ir.IntType(32)), ir.IntType(32))) + fname = "llvm.nvvm.atomic.load.dec.32.p0i32" + fnty = ir.FunctionType( + ir.IntType(32), (ir.PointerType(ir.IntType(32)), ir.IntType(32)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_dec_int64(lmod): - fname = '___numba_atomic_u64_dec' - fnty = ir.FunctionType(ir.IntType(64), - (ir.PointerType(ir.IntType(64)), ir.IntType(64))) + fname = "___numba_atomic_u64_dec" + fnty = ir.FunctionType( + ir.IntType(64), (ir.PointerType(ir.IntType(64)), ir.IntType(64)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_max_float32(lmod): - fname = '___numba_atomic_float_max' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_max" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_max_float64(lmod): - fname = '___numba_atomic_double_max' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_max" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_min_float32(lmod): - fname = '___numba_atomic_float_min' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + 
fname = "___numba_atomic_float_min" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_min_float64(lmod): - fname = '___numba_atomic_double_min' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_min" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmax_float32(lmod): - fname = '___numba_atomic_float_nanmax' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_nanmax" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmax_float64(lmod): - fname = '___numba_atomic_double_nanmax' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_nanmax" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmin_float32(lmod): - fname = '___numba_atomic_float_nanmin' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_nanmin" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmin_float64(lmod): - fname = '___numba_atomic_double_nanmin' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_nanmin" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) 
return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_cudaCGGetIntrinsicHandle(lmod): - fname = 'cudaCGGetIntrinsicHandle' - fnty = ir.FunctionType(ir.IntType(64), - (ir.IntType(32),)) + fname = "cudaCGGetIntrinsicHandle" + fnty = ir.FunctionType(ir.IntType(64), (ir.IntType(32),)) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_cudaCGSynchronize(lmod): - fname = 'cudaCGSynchronize' - fnty = ir.FunctionType(ir.IntType(32), - (ir.IntType(64), ir.IntType(32))) + fname = "cudaCGSynchronize" + fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(64), ir.IntType(32))) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_string(builder, value): lmod = builder.basic_block.function.module cval = cgutils.make_bytearray(value.encode("utf-8") + b"\x00") - gl = cgutils.add_global_variable(lmod, cval.type, name="_str", - addrspace=nvvm.ADDRSPACE_CONSTANT) - gl.linkage = 'internal' + gl = cgutils.add_global_variable( + lmod, cval.type, name="_str", addrspace=nvvm.ADDRSPACE_CONSTANT + ) + gl.linkage = "internal" gl.global_constant = True gl.initializer = cval - return builder.addrspacecast(gl, ir.PointerType(ir.IntType(8)), 'generic') + return builder.addrspacecast(gl, ir.PointerType(ir.IntType(8)), "generic") def declare_vprint(lmod): @@ -172,24 +191,20 @@ def declare_vprint(lmod): # ----------------------------------------------------------------------------- SREG_MAPPING = { - 'tid.x': 'llvm.nvvm.read.ptx.sreg.tid.x', - 'tid.y': 'llvm.nvvm.read.ptx.sreg.tid.y', - 'tid.z': 'llvm.nvvm.read.ptx.sreg.tid.z', - - 'ntid.x': 'llvm.nvvm.read.ptx.sreg.ntid.x', - 'ntid.y': 'llvm.nvvm.read.ptx.sreg.ntid.y', - 'ntid.z': 'llvm.nvvm.read.ptx.sreg.ntid.z', - - 'ctaid.x': 'llvm.nvvm.read.ptx.sreg.ctaid.x', - 'ctaid.y': 'llvm.nvvm.read.ptx.sreg.ctaid.y', - 'ctaid.z': 'llvm.nvvm.read.ptx.sreg.ctaid.z', - - 'nctaid.x': 'llvm.nvvm.read.ptx.sreg.nctaid.x', - 'nctaid.y': 'llvm.nvvm.read.ptx.sreg.nctaid.y', - 'nctaid.z': 
'llvm.nvvm.read.ptx.sreg.nctaid.z', - - 'warpsize': 'llvm.nvvm.read.ptx.sreg.warpsize', - 'laneid': 'llvm.nvvm.read.ptx.sreg.laneid', + "tid.x": "llvm.nvvm.read.ptx.sreg.tid.x", + "tid.y": "llvm.nvvm.read.ptx.sreg.tid.y", + "tid.z": "llvm.nvvm.read.ptx.sreg.tid.z", + "ntid.x": "llvm.nvvm.read.ptx.sreg.ntid.x", + "ntid.y": "llvm.nvvm.read.ptx.sreg.ntid.y", + "ntid.z": "llvm.nvvm.read.ptx.sreg.ntid.z", + "ctaid.x": "llvm.nvvm.read.ptx.sreg.ctaid.x", + "ctaid.y": "llvm.nvvm.read.ptx.sreg.ctaid.y", + "ctaid.z": "llvm.nvvm.read.ptx.sreg.ctaid.z", + "nctaid.x": "llvm.nvvm.read.ptx.sreg.nctaid.x", + "nctaid.y": "llvm.nvvm.read.ptx.sreg.nctaid.y", + "nctaid.z": "llvm.nvvm.read.ptx.sreg.nctaid.z", + "warpsize": "llvm.nvvm.read.ptx.sreg.warpsize", + "laneid": "llvm.nvvm.read.ptx.sreg.laneid", } @@ -205,16 +220,16 @@ def __init__(self, builder): self.builder = builder def tid(self, xyz): - return call_sreg(self.builder, 'tid.%s' % xyz) + return call_sreg(self.builder, "tid.%s" % xyz) def ctaid(self, xyz): - return call_sreg(self.builder, 'ctaid.%s' % xyz) + return call_sreg(self.builder, "ctaid.%s" % xyz) def ntid(self, xyz): - return call_sreg(self.builder, 'ntid.%s' % xyz) + return call_sreg(self.builder, "ntid.%s" % xyz) def nctaid(self, xyz): - return call_sreg(self.builder, 'nctaid.%s' % xyz) + return call_sreg(self.builder, "nctaid.%s" % xyz) def getdim(self, xyz): i64 = ir.IntType(64) @@ -227,7 +242,7 @@ def getdim(self, xyz): def get_global_id(builder, dim): sreg = SRegBuilder(builder) - it = (sreg.getdim(xyz) for xyz in 'xyz') + it = (sreg.getdim(xyz) for xyz in "xyz") seq = list(itertools.islice(it, None, dim)) if dim == 1: return seq[0] diff --git a/numba_cuda/numba/cuda/printimpl.py b/numba_cuda/numba/cuda/printimpl.py index b8f3d2eec..6acd70049 100644 --- a/numba_cuda/numba/cuda/printimpl.py +++ b/numba_cuda/numba/cuda/printimpl.py @@ -15,6 +15,7 @@ # NOTE: we don't use @lower here since print_item() doesn't return a LLVM value + @singledispatch def 
print_item(ty, context, builder, val): """ @@ -22,8 +23,9 @@ def print_item(ty, context, builder, val): A (format string, [list of arguments]) is returned that will allow forming the final printf()-like call. """ - raise NotImplementedError("printing unimplemented for values of type %s" - % (ty,)) + raise NotImplementedError( + "printing unimplemented for values of type %s" % (ty,) + ) @print_item.register(types.Integer) @@ -92,11 +94,13 @@ def print_varargs(context, builder, sig, args): rawfmt = " ".join(formats) + "\n" if len(args) > 32: - msg = ('CUDA print() cannot print more than 32 items. ' - 'The raw format string will be emitted by the kernel instead.') + msg = ( + "CUDA print() cannot print more than 32 items. " + "The raw format string will be emitted by the kernel instead." + ) warn(msg, NumbaWarning) - rawfmt = rawfmt.replace('%', '%%') + rawfmt = rawfmt.replace("%", "%%") fmt = context.insert_string_const_addrspace(builder, rawfmt) array = cgutils.make_anonymous_struct(builder, values) arrayptr = cgutils.alloca_once_value(builder, array) diff --git a/numba_cuda/numba/cuda/random.py b/numba_cuda/numba/cuda/random.py index 460c7fc21..82905e8ac 100644 --- a/numba_cuda/numba/cuda/random.py +++ b/numba_cuda/numba/cuda/random.py @@ -1,7 +1,16 @@ import math -from numba import (config, cuda, float32, float64, uint32, int64, uint64, - from_dtype, jit) +from numba import ( + config, + cuda, + float32, + float64, + uint32, + int64, + uint64, + from_dtype, + jit, +) import numpy as np @@ -29,8 +38,9 @@ # using the CPU @jit decorator everywhere to create functions that work as # both CPU and CUDA device functions. 
-xoroshiro128p_dtype = np.dtype([('s0', np.uint64), ('s1', np.uint64)], - align=True) +xoroshiro128p_dtype = np.dtype( + [("s0", np.uint64), ("s1", np.uint64)], align=True +) xoroshiro128p_type = from_dtype(xoroshiro128p_dtype) # When cudasim is enabled, Fake CUDA arrays are passed to some of the @@ -45,7 +55,7 @@ @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def init_xoroshiro128p_state(states, index, seed): - '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed. + """Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed. This ensures that manually set small seeds don't result in a predictable initial sequence from the random number generator. @@ -56,7 +66,7 @@ def init_xoroshiro128p_state(states, index, seed): :param index: offset in states to update :type seed: int64 :param seed: seed value to use when initializing state - ''' + """ index = int64(index) seed = uint64(seed) @@ -65,13 +75,13 @@ def init_xoroshiro128p_state(states, index, seed): z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB) z = z ^ (z >> uint32(31)) - states[index]['s0'] = z - states[index]['s1'] = z + states[index]["s0"] = z + states[index]["s1"] = z @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def rotl(x, k): - '''Left rotate x by k bits.''' + """Left rotate x by k bits.""" x = uint64(x) k = uint32(k) return (x << k) | (x >> uint32(64 - k)) @@ -79,38 +89,38 @@ def rotl(x, k): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_next(states, index): - '''Return the next random uint64 and advance the RNG in states[index]. + """Return the next random uint64 and advance the RNG in states[index]. 
:type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: uint64 - ''' + """ index = int64(index) - s0 = states[index]['s0'] - s1 = states[index]['s1'] + s0 = states[index]["s0"] + s1 = states[index]["s1"] result = s0 + s1 s1 ^= s0 - states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14)) - states[index]['s1'] = uint64(rotl(s1, uint32(36))) + states[index]["s0"] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14)) + states[index]["s1"] = uint64(rotl(s1, uint32(36))) return result @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_jump(states, index): - '''Advance the RNG in ``states[index]`` by 2**64 steps. + """Advance the RNG in ``states[index]`` by 2**64 steps. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update - ''' + """ index = int64(index) - jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922)) + jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922)) s0 = uint64(0) s1 = uint64(0) @@ -118,52 +128,52 @@ def xoroshiro128p_jump(states, index): for i in range(2): for b in range(64): if jump[i] & (uint64(1) << uint32(b)): - s0 ^= states[index]['s0'] - s1 ^= states[index]['s1'] + s0 ^= states[index]["s0"] + s1 ^= states[index]["s1"] xoroshiro128p_next(states, index) - states[index]['s0'] = s0 - states[index]['s1'] = s1 + states[index]["s0"] = s0 + states[index]["s1"] = s1 @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def uint64_to_unit_float64(x): - '''Convert uint64 to float64 value in the range [0.0, 1.0)''' + """Convert uint64 to float64 value in the range [0.0, 1.0)""" x = uint64(x) return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53))) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def uint64_to_unit_float32(x): - '''Convert uint64 to float32 value in 
the range [0.0, 1.0)''' + """Convert uint64 to float32 value in the range [0.0, 1.0)""" x = uint64(x) return float32(uint64_to_unit_float64(x)) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_uniform_float32(states, index): - '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``. + """Return a float32 in range [0.0, 1.0) and advance ``states[index]``. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: float32 - ''' + """ index = int64(index) return uint64_to_unit_float32(xoroshiro128p_next(states, index)) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_uniform_float64(states, index): - '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``. + """Return a float64 in range [0.0, 1.0) and advance ``states[index]``. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: float64 - ''' + """ index = int64(index) return uint64_to_unit_float64(xoroshiro128p_next(states, index)) @@ -174,7 +184,7 @@ def xoroshiro128p_uniform_float64(states, index): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_normal_float32(states, index): - '''Return a normally distributed float32 and advance ``states[index]``. + """Return a normally distributed float32 and advance ``states[index]``. The return value is drawn from a Gaussian of mean=0 and sigma=1 using the Box-Muller transform. This advances the RNG sequence by two steps. 
@@ -184,7 +194,7 @@ def xoroshiro128p_normal_float32(states, index): :type index: int64 :param index: offset in states to update :rtype: float32 - ''' + """ index = int64(index) u1 = xoroshiro128p_uniform_float32(states, index) @@ -199,7 +209,7 @@ def xoroshiro128p_normal_float32(states, index): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_normal_float64(states, index): - '''Return a normally distributed float32 and advance ``states[index]``. + """Return a normally distributed float32 and advance ``states[index]``. The return value is drawn from a Gaussian of mean=0 and sigma=1 using the Box-Muller transform. This advances the RNG sequence by two steps. @@ -209,7 +219,7 @@ def xoroshiro128p_normal_float64(states, index): :type index: int64 :param index: offset in states to update :rtype: float64 - ''' + """ index = int64(index) u1 = xoroshiro128p_uniform_float32(states, index) @@ -242,7 +252,7 @@ def init_xoroshiro128p_states_cpu(states, seed, subsequence_start): def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): - '''Initialize RNG states on the GPU for parallel generators. + """Initialize RNG states on the GPU for parallel generators. This initializes the RNG states so that each state in the array corresponds subsequences in the separated by 2**64 steps from each other in the main @@ -257,7 +267,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): :param states: array of RNG states :type seed: uint64 :param seed: starting seed for list of generators - ''' + """ # Initialization on CPU is much faster than the GPU states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype) @@ -267,7 +277,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0): - '''Returns a new device array initialized for n random number generators. 
+ """Returns a new device array initialized for n random number generators. This initializes the RNG states so that each state in the array corresponds subsequences in the separated by 2**64 steps from each other in the main @@ -286,7 +296,7 @@ def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0): :param subsequence_start: :type stream: CUDA stream :param stream: stream to run initialization kernel on - ''' + """ states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream) init_xoroshiro128p_states(states, seed, subsequence_start, stream) return states diff --git a/numba_cuda/numba/cuda/reshape_funcs.cu b/numba_cuda/numba/cuda/reshape_funcs.cu index 123bfed97..7dfc19db7 100644 --- a/numba_cuda/numba/cuda/reshape_funcs.cu +++ b/numba_cuda/numba/cuda/reshape_funcs.cu @@ -148,4 +148,4 @@ numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp * } return 1; -} \ No newline at end of file +} diff --git a/numba_cuda/numba/cuda/runtime/__init__.py b/numba_cuda/numba/cuda/runtime/__init__.py index 636f187d3..881d0d0d0 100644 --- a/numba_cuda/numba/cuda/runtime/__init__.py +++ b/numba_cuda/numba/cuda/runtime/__init__.py @@ -1 +1 @@ -from numba.cuda.runtime.nrt import rtsys # noqa: F401 +from numba.cuda.runtime.nrt import rtsys # noqa: F401 diff --git a/numba_cuda/numba/cuda/runtime/memsys.cu b/numba_cuda/numba/cuda/runtime/memsys.cu index a5820971c..ed1133dbd 100644 --- a/numba_cuda/numba/cuda/runtime/memsys.cu +++ b/numba_cuda/numba/cuda/runtime/memsys.cu @@ -91,4 +91,4 @@ extern "C" __global__ void NRT_MemSys_print(void) } else { printf("TheMsys is null.\n"); } -} \ No newline at end of file +} diff --git a/numba_cuda/numba/cuda/runtime/memsys.cuh b/numba_cuda/numba/cuda/runtime/memsys.cuh index 862a1754c..74cfefc49 100644 --- a/numba_cuda/numba/cuda/runtime/memsys.cuh +++ b/numba_cuda/numba/cuda/runtime/memsys.cuh @@ -14,4 +14,4 @@ struct NRT_MemSys { /* The Memory System object */ __device__ NRT_MemSys* TheMSys; 
-extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr); \ No newline at end of file +extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr); diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu index 879bf8d2f..a318dd4dd 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size) { void* ptr = NULL; ptr = malloc(size); - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); } return ptr; } @@ -49,7 +49,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, mi->dtor_info = dtor_info; mi->data = data; mi->size = size; - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); } } @@ -77,7 +77,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) { NRT_dealloc(mi); - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); } } diff --git a/numba_cuda/numba/cuda/runtime/nrt.py b/numba_cuda/numba/cuda/runtime/nrt.py index 0b6781789..8b3be0c8e 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.py +++ b/numba_cuda/numba/cuda/runtime/nrt.py @@ -5,26 +5,28 @@ from numba import cuda, config from numba.core.runtime.nrt import _nrt_mstats -from numba.cuda.cudadrv.driver import (Linker, driver, launch_kernel, - USE_NV_BINDING) +from numba.cuda.cudadrv.driver import ( + Linker, + driver, + launch_kernel, + USE_NV_BINDING, +) from numba.cuda.cudadrv import devices from numba.cuda.api import get_current_device from numba.cuda.utils import _readenv # Check environment variable or config for NRT statistics enablement -NRT_STATS = ( - _readenv("NUMBA_CUDA_NRT_STATS", bool, 
False) or - getattr(config, "NUMBA_CUDA_NRT_STATS", False) +NRT_STATS = _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or getattr( + config, "NUMBA_CUDA_NRT_STATS", False ) if not hasattr(config, "NUMBA_CUDA_NRT_STATS"): config.CUDA_NRT_STATS = NRT_STATS # Check environment variable or config for NRT enablement -ENABLE_NRT = ( - _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or - getattr(config, "NUMBA_CUDA_ENABLE_NRT", False) +ENABLE_NRT = _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or getattr( + config, "NUMBA_CUDA_ENABLE_NRT", False ) if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"): config.CUDA_ENABLE_NRT = ENABLE_NRT @@ -35,16 +37,19 @@ def _alloc_init_guard(method): """ Ensure NRT memory allocation and initialization before running the method """ + @wraps(method) def wrapper(self, *args, **kwargs): self.ensure_allocated() self.ensure_initialized() return method(self, *args, **kwargs) + return wrapper class _Runtime: """Singleton class for Numba CUDA runtime""" + _instance = None def __new__(cls, *args, **kwargs): @@ -64,8 +69,7 @@ def _compile_memsys_module(self): """ # Define the path for memsys.cu memsys_mod = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "memsys.cu" + os.path.dirname(os.path.abspath(__file__)), "memsys.cu" ) cc = get_current_device().compute_capability @@ -105,10 +109,12 @@ def allocate(self, stream=None): # Allocate space for NRT_MemSys ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size") memsys_size = ctypes.c_uint64() - driver.cuMemcpyDtoH(ctypes.addressof(memsys_size), - ptr.device_ctypes_pointer, nbytes) + driver.cuMemcpyDtoH( + ctypes.addressof(memsys_size), ptr.device_ctypes_pointer, nbytes + ) self._memsys = device_array( - (memsys_size.value,), dtype="i1", stream=stream) + (memsys_size.value,), dtype="i1", stream=stream + ) self.set_memsys_to_module(self._memsys_module, stream=stream) def _single_thread_launch(self, module, stream, name, params=()): @@ -121,12 +127,16 @@ def 
_single_thread_launch(self, module, stream, name, params=()): func = module.get_function(name) launch_kernel( func.handle, - 1, 1, 1, - 1, 1, 1, + 1, + 1, + 1, + 1, + 1, + 1, 0, stream.handle, params, - cooperative=False + cooperative=False, ) def _ctypes_pointer(self, array): @@ -158,7 +168,8 @@ def initialize(self, stream=None): self.ensure_allocated() self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_init") + self._memsys_module, stream, "NRT_MemSys_init" + ) self._initialized = True if config.CUDA_NRT_STATS: @@ -170,7 +181,8 @@ def memsys_enable_stats(self, stream=None): Enable memsys statistics """ self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_enable_stats") + self._memsys_module, stream, "NRT_MemSys_enable_stats" + ) @_alloc_init_guard def memsys_disable_stats(self, stream=None): @@ -178,7 +190,8 @@ def memsys_disable_stats(self, stream=None): Disable memsys statistics """ self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_disable_stats") + self._memsys_module, stream, "NRT_MemSys_disable_stats" + ) @_alloc_init_guard def memsys_stats_enabled(self, stream=None): @@ -193,7 +206,7 @@ def memsys_stats_enabled(self, stream=None): self._memsys_module, stream, "NRT_MemSys_stats_enabled", - (enabled_ptr,) + (enabled_ptr,), ) cuda.synchronize() @@ -204,21 +217,20 @@ def _copy_memsys_to_host(self, stream): """ Copy all statistics of memsys to the host """ - dt = np.dtype([ - ('alloc', np.uint64), - ('free', np.uint64), - ('mi_alloc', np.uint64), - ('mi_free', np.uint64) - ]) + dt = np.dtype( + [ + ("alloc", np.uint64), + ("free", np.uint64), + ("mi_alloc", np.uint64), + ("mi_free", np.uint64), + ] + ) stats_for_read = cuda.managed_array(1, dt) stats_ptr = self._ctypes_pointer(stats_for_read) self._single_thread_launch( - self._memsys_module, - stream, - "NRT_MemSys_read", - [stats_ptr] + self._memsys_module, stream, "NRT_MemSys_read", [stats_ptr] ) cuda.synchronize() @@ -237,7 +249,7 @@ def 
get_allocation_stats(self, stream=None): alloc=memsys["alloc"], free=memsys["free"], mi_alloc=memsys["mi_alloc"], - mi_free=memsys["mi_free"] + mi_free=memsys["mi_free"], ) @_alloc_init_guard @@ -249,10 +261,7 @@ def _get_single_stat(self, stat, stream=None): got_ptr = self._ctypes_pointer(got) self._single_thread_launch( - self._memsys_module, - stream, - f"NRT_MemSys_read_{stat}", - [got_ptr] + self._memsys_module, stream, f"NRT_MemSys_read_{stat}", [got_ptr] ) cuda.synchronize() @@ -309,15 +318,13 @@ def set_memsys_to_module(self, module, stream=None): """ if self._memsys is None: raise RuntimeError( - "Please allocate NRT Memsys first before setting to module.") + "Please allocate NRT Memsys first before setting to module." + ) memsys_ptr = self._ctypes_pointer(self._memsys) self._single_thread_launch( - module, - stream, - "NRT_MemSys_set", - [memsys_ptr] + module, stream, "NRT_MemSys_set", [memsys_ptr] ) @_alloc_init_guard @@ -327,9 +334,7 @@ def print_memsys(self, stream=None): """ cuda.synchronize() self._single_thread_launch( - self._memsys_module, - stream, - "NRT_MemSys_print" + self._memsys_module, stream, "NRT_MemSys_print" ) diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py index d24aa6e7d..ad75c1ec7 100644 --- a/numba_cuda/numba/cuda/simulator/__init__.py +++ b/numba_cuda/numba/cuda/simulator/__init__.py @@ -3,14 +3,22 @@ from .api import * from .vector_types import vector_types from .reduction import Reduce -from .cudadrv.devicearray import (device_array, device_array_like, pinned, - pinned_array, pinned_array_like, - mapped_array, to_device, auto_device) +from .cudadrv.devicearray import ( + device_array, + device_array_like, + pinned, + pinned_array, + pinned_array_like, + mapped_array, + to_device, + auto_device, +) from .cudadrv import devicearray from .cudadrv.devices import require_context, gpus from .cudadrv.devices import get_context as current_context from .cudadrv.runtime import runtime 
from numba.core import config + reduce = Reduce # Register simulated vector types as module level variables @@ -25,14 +33,16 @@ if config.ENABLE_CUDASIM: import sys from numba.cuda.simulator import cudadrv - sys.modules['numba.cuda.cudadrv'] = cudadrv - sys.modules['numba.cuda.cudadrv.devicearray'] = cudadrv.devicearray - sys.modules['numba.cuda.cudadrv.devices'] = cudadrv.devices - sys.modules['numba.cuda.cudadrv.driver'] = cudadrv.driver - sys.modules['numba.cuda.cudadrv.runtime'] = cudadrv.runtime - sys.modules['numba.cuda.cudadrv.drvapi'] = cudadrv.drvapi - sys.modules['numba.cuda.cudadrv.error'] = cudadrv.error - sys.modules['numba.cuda.cudadrv.nvvm'] = cudadrv.nvvm + + sys.modules["numba.cuda.cudadrv"] = cudadrv + sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray + sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices + sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver + sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime + sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi + sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error + sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm from . import compiler - sys.modules['numba.cuda.compiler'] = compiler + + sys.modules["numba.cuda.compiler"] = compiler diff --git a/numba_cuda/numba/cuda/simulator/api.py b/numba_cuda/numba/cuda/simulator/api.py index c6a55e88e..39a893d3f 100644 --- a/numba_cuda/numba/cuda/simulator/api.py +++ b/numba_cuda/numba/cuda/simulator/api.py @@ -1,6 +1,6 @@ -''' +""" Contains CUDA API functions -''' +""" # Imports here bring together parts of the API from other modules, so some of # them appear unused. 
@@ -15,7 +15,7 @@ def select_device(dev=0): - assert dev == 0, 'Only a single device supported by the simulator' + assert dev == 0, "Only a single device supported by the simulator" def is_float16_supported(): @@ -23,10 +23,11 @@ def is_float16_supported(): class stream(object): - ''' + """ The stream API is supported in the simulator - however, all execution occurs synchronously, so synchronization requires no operation. - ''' + """ + @contextmanager def auto_synchronize(self): yield @@ -62,9 +63,9 @@ def declare_device(*args, **kwargs): def detect(): - print('Found 1 CUDA devices') - print('id %d %20s %40s' % (0, 'SIMULATOR', '[SUPPORTED]')) - print('%40s: 5.0' % 'compute capability') + print("Found 1 CUDA devices") + print("id %d %20s %40s" % (0, "SIMULATOR", "[SUPPORTED]")) + print("%40s: 5.0" % "compute capability") def list_devices(): @@ -73,11 +74,13 @@ def list_devices(): # Events + class Event(object): - ''' + """ The simulator supports the event API, but they do not record timing info, and all simulation is synchronous. Execution time is not recorded. 
- ''' + """ + def record(self, stream=0): pass @@ -88,35 +91,48 @@ def synchronize(self): pass def elapsed_time(self, event): - warn('Simulator timings are bogus') + warn("Simulator timings are bogus") return 0.0 event = Event -def jit(func_or_sig=None, device=False, debug=None, argtypes=None, - inline=False, restype=None, fastmath=False, link=None, - boundscheck=None, opt=None, cache=None - ): +def jit( + func_or_sig=None, + device=False, + debug=None, + argtypes=None, + inline=False, + restype=None, + fastmath=False, + link=None, + boundscheck=None, + opt=None, + cache=None, +): # Here for API compatibility if boundscheck: raise NotImplementedError("bounds checking is not supported for CUDA") if link is not None: - raise NotImplementedError('Cannot link PTX in the simulator') + raise NotImplementedError("Cannot link PTX in the simulator") debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug # Check for first argument specifying types - in that case the # decorator is not being passed a function - if (func_or_sig is None or is_signature(func_or_sig) - or isinstance(func_or_sig, list)): + if ( + func_or_sig is None + or is_signature(func_or_sig) + or isinstance(func_or_sig, list) + ): + def jitwrapper(fn): - return FakeCUDAKernel(fn, - device=device, - fastmath=fastmath, - debug=debug) + return FakeCUDAKernel( + fn, device=device, fastmath=fastmath, debug=debug + ) + return jitwrapper return FakeCUDAKernel(func_or_sig, device=device, debug=debug) diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py index 7db28d41a..ddebaf51c 100644 --- a/numba_cuda/numba/cuda/simulator/compiler.py +++ b/numba_cuda/numba/cuda/simulator/compiler.py @@ -1,7 +1,7 @@ -''' +""" The compiler is not implemented in the simulator. This module provides a stub to allow tests to import successfully. 
-''' +""" compile = None compile_for_current_device = None diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py b/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py index dde9362d4..128579600 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py @@ -1,2 +1,8 @@ -from numba.cuda.simulator.cudadrv import (devicearray, devices, driver, drvapi, - error, nvvm) +from numba.cuda.simulator.cudadrv import ( + devicearray, + devices, + driver, + drvapi, + error, + nvvm, +) diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py b/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py index 088184fd4..47d7777af 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py @@ -1,7 +1,8 @@ -''' +""" The Device Array API is not implemented in the simulator. This module provides stubs to allow tests to import correctly. -''' +""" + from contextlib import contextmanager from numba.np.numpy_support import numpy_version @@ -12,37 +13,39 @@ from_record_like = None -errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot " - "be transferred as a single memory region. Please " - "ensure contiguous buffer with numpy " - ".ascontiguousarray()") +errmsg_contiguous_buffer = ( + "Array contains non-contiguous buffer and cannot " + "be transferred as a single memory region. Please " + "ensure contiguous buffer with numpy " + ".ascontiguousarray()" +) class FakeShape(tuple): - ''' + """ The FakeShape class is used to provide a shape which does not allow negative indexing, similar to the shape in CUDA Python. 
(Numpy shape arrays allow negative indexing) - ''' + """ def __getitem__(self, k): if isinstance(k, int) and k < 0: - raise IndexError('tuple index out of range') + raise IndexError("tuple index out of range") return super(FakeShape, self).__getitem__(k) class FakeWithinKernelCUDAArray(object): - ''' + """ Created to emulate the behavior of arrays within kernels, where either array.item or array['item'] is valid (that is, give all structured arrays `numpy.recarray`-like semantics). This behaviour does not follow the semantics of Python and NumPy with non-jitted code, and will be deprecated and removed. - ''' + """ def __init__(self, item): assert isinstance(item, FakeCUDAArray) - self.__dict__['_item'] = item + self.__dict__["_item"] = item def __wrap_if_fake(self, item): if isinstance(item, FakeCUDAArray): @@ -84,18 +87,18 @@ def convert_fakes(obj): return obj - out = kwargs.get('out') + out = kwargs.get("out") if out: - kwargs['out'] = tuple(convert_fakes(o) for o in out) + kwargs["out"] = tuple(convert_fakes(o) for o in out) args = tuple(convert_fakes(a) for a in args) return call(*args, **kwargs) class FakeCUDAArray(object): - ''' + """ Implements the interface of a DeviceArray/DeviceRecord, but mostly just wraps a NumPy array. - ''' + """ __cuda_ndarray__ = True # There must be gpu_data attribute @@ -149,13 +152,13 @@ def copy_to_host(self, ary=None, stream=0): return ary def copy_to_device(self, ary, stream=0): - ''' + """ Copy from the provided array into this array. This may be less forgiving than the CUDA Python implementation, which will copy data up to the length of the smallest of the two arrays, whereas this expects the size of the arrays to be equal. 
- ''' + """ sentry_contiguous(self) self_core, ary_core = array_core(self), array_core(ary) if isinstance(ary, FakeCUDAArray): @@ -164,9 +167,10 @@ def copy_to_device(self, ary, stream=0): else: ary_core = np.array( ary_core, - order='C' if self_core.flags['C_CONTIGUOUS'] else 'F', + order="C" if self_core.flags["C_CONTIGUOUS"] else "F", subok=True, - copy=False if numpy_version < (2, 0) else None) + copy=False if numpy_version < (2, 0) else None, + ) check_array_compatibility(self_core, ary_core) np.copyto(self_core._ary, ary_core) @@ -237,7 +241,7 @@ def __mod__(self, other): return FakeCUDAArray(self._ary % other) def __pow__(self, other): - return FakeCUDAArray(self._ary ** other) + return FakeCUDAArray(self._ary**other) def split(self, section, stream=0): return [ @@ -282,30 +286,33 @@ def is_contiguous(ary): def sentry_contiguous(ary): core = array_core(ary) - if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']: + if not core.flags["C_CONTIGUOUS"] and not core.flags["F_CONTIGUOUS"]: raise ValueError(errmsg_contiguous_buffer) def check_array_compatibility(ary1, ary2): ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze() if ary1.dtype != ary2.dtype: - raise TypeError('incompatible dtype: %s vs. %s' % - (ary1.dtype, ary2.dtype)) + raise TypeError( + "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype) + ) if ary1sq.shape != ary2sq.shape: - raise ValueError('incompatible shape: %s vs. %s' % - (ary1.shape, ary2.shape)) + raise ValueError( + "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape) + ) if ary1sq.strides != ary2sq.strides: - raise ValueError('incompatible strides: %s vs. %s' % - (ary1.strides, ary2.strides)) + raise ValueError( + "incompatible strides: %s vs. 
%s" % (ary1.strides, ary2.strides) + ) def to_device(ary, stream=0, copy=True, to=None): - ary = np.array(ary, - copy=False if numpy_version < (2, 0) else None, - subok=True) + ary = np.array( + ary, copy=False if numpy_version < (2, 0) else None, subok=True + ) sentry_contiguous(ary) if to is None: - buffer_dtype = np.int64 if ary.dtype.char in 'Mm' else ary.dtype + buffer_dtype = np.int64 if ary.dtype.char in "Mm" else ary.dtype return FakeCUDAArray( np.ndarray( buffer=np.copy(array_core(ary)).view(buffer_dtype), @@ -324,22 +331,22 @@ def pinned(arg): def mapped_array(*args, **kwargs): - for unused_arg in ('portable', 'wc'): + for unused_arg in ("portable", "wc"): if unused_arg in kwargs: kwargs.pop(unused_arg) return device_array(*args, **kwargs) -def pinned_array(shape, dtype=np.float64, strides=None, order='C'): +def pinned_array(shape, dtype=np.float64, strides=None, order="C"): return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order) -def managed_array(shape, dtype=np.float64, strides=None, order='C'): +def managed_array(shape, dtype=np.float64, strides=None, order="C"): return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order) def device_array(*args, **kwargs): - stream = kwargs.pop('stream') if 'stream' in kwargs else 0 + stream = kwargs.pop("stream") if "stream" in kwargs else 0 return FakeCUDAArray(np.ndarray(*args, **kwargs), stream=stream) @@ -350,7 +357,7 @@ def _contiguous_strides_like_array(ary): """ # Don't recompute strides if the default strides will be sufficient to # create a contiguous array. - if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1: + if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1: return None # Otherwise, we need to compute new strides using an algorithm adapted from @@ -360,7 +367,7 @@ def _contiguous_strides_like_array(ary): # Stride permutation. E.g. 
a stride array (4, -2, 12) becomes # [(1, -2), (0, 4), (2, 12)] - strideperm = [ x for x in enumerate(ary.strides) ] + strideperm = [x for x in enumerate(ary.strides)] strideperm.sort(key=lambda x: x[1]) # Compute new strides using permutation @@ -373,24 +380,26 @@ def _contiguous_strides_like_array(ary): def _order_like_array(ary): - if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']: - return 'F' + if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]: + return "F" else: - return 'C' + return "C" def device_array_like(ary, stream=0): strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return device_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) def pinned_array_like(ary): strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return pinned_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) def auto_device(ary, stream=0, copy=True): @@ -399,15 +408,14 @@ def auto_device(ary, stream=0, copy=True): if not isinstance(ary, np.void): ary = np.array( - ary, - copy=False if numpy_version < (2, 0) else None, - subok=True) + ary, copy=False if numpy_version < (2, 0) else None, subok=True + ) return to_device(ary, stream, copy), True def is_cuda_ndarray(obj): "Check if an object is a CUDA ndarray" - return getattr(obj, '__cuda_ndarray__', False) + return getattr(obj, "__cuda_ndarray__", False) def verify_cuda_ndarray_interface(obj): @@ -418,15 +426,15 @@ def requires_attr(attr, typ): if not hasattr(obj, attr): raise AttributeError(attr) if not isinstance(getattr(obj, attr), typ): - raise AttributeError('%s must be of type %s' % (attr, typ)) + raise AttributeError("%s must be of type %s" % (attr, typ)) - requires_attr('shape', tuple) - requires_attr('strides', tuple) - 
requires_attr('dtype', np.dtype) - requires_attr('size', int) + requires_attr("shape", tuple) + requires_attr("strides", tuple) + requires_attr("dtype", np.dtype) + requires_attr("size", int) def require_cuda_ndarray(obj): "Raises ValueError is is_cuda_ndarray(obj) evaluates False" if not is_cuda_ndarray(obj): - raise ValueError('require an cuda ndarray object') + raise ValueError("require an cuda ndarray object") diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/devices.py b/numba_cuda/numba/cuda/simulator/cudadrv/devices.py index 3237fb2c6..433316262 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/devices.py @@ -8,7 +8,7 @@ class FakeCUDADevice: def __init__(self): - self.uuid = 'GPU-00000000-0000-0000-0000-000000000000' + self.uuid = "GPU-00000000-0000-0000-0000-000000000000" @property def compute_capability(self): @@ -16,10 +16,11 @@ def compute_capability(self): class FakeCUDAContext: - ''' + """ This stub implements functionality only for simulating a single GPU at the moment. - ''' + """ + def __init__(self, device_id): self._device_id = device_id self._device = FakeCUDADevice() @@ -54,7 +55,7 @@ def get_memory_info(self): dependencies, e.g. `psutil` - so return infinite memory to maintain API type compatibility """ - return _MemoryInfo(float('inf'), float('inf')) + return _MemoryInfo(float("inf"), float("inf")) def memalloc(self, sz): """ @@ -62,19 +63,20 @@ def memalloc(self, sz): At present, there is no division between simulated host memory and simulated device memory. """ - return np.ndarray(sz, dtype='u1') + return np.ndarray(sz, dtype="u1") def memhostalloc(self, sz, mapped=False, portable=False, wc=False): - '''Allocates memory on the host''' + """Allocates memory on the host""" return self.memalloc(sz) class FakeDeviceList: - ''' + """ This stub implements a device list containing a single GPU. It also keeps track of the GPU status, i.e. 
whether the context is closed or not, which may have been set by the user calling reset() - ''' + """ + def __init__(self): self.lst = (FakeCUDAContext(0),) self.closed = False @@ -84,7 +86,7 @@ def __getitem__(self, devnum): return self.lst[devnum] def __str__(self): - return ', '.join([str(d) for d in self.lst]) + return ", ".join([str(d) for d in self.lst]) def __iter__(self): return iter(self.lst) @@ -111,7 +113,7 @@ def get_context(devnum=0): def require_context(func): - ''' + """ In the simulator, a context is always "available", so this is a no-op. - ''' + """ return func diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py index 09de5b729..7a567de58 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py @@ -1,15 +1,15 @@ -''' +""" Most of the driver API is unsupported in the simulator, but some stubs are provided to allow tests to import correctly. -''' +""" def device_memset(dst, val, size, stream=0): - dst.view('u1')[:size].fill(bytes([val])[0]) + dst.view("u1")[:size].fill(bytes([val])[0]) def host_to_device(dst, src, size, stream=0): - dst.view('u1')[:size] = src.view('u1')[:size] + dst.view("u1")[:size] = src.view("u1")[:size] def device_to_host(dst, src, size, stream=0): @@ -55,7 +55,7 @@ class CudaAPIError(RuntimeError): def launch_kernel(*args, **kwargs): - msg = 'Launching kernels directly is not supported in the simulator' + msg = "Launching kernels directly is not supported in the simulator" raise RuntimeError(msg) diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py b/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py index 44c697f37..8229cba8d 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py @@ -1,4 +1,4 @@ -''' +""" drvapi is not implemented in the simulator, but this module exists to allow tests to import correctly. 
-''' +""" diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/libs.py b/numba_cuda/numba/cuda/simulator/cudadrv/libs.py index 347b936c5..3b56434d6 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/libs.py @@ -1,2 +1,2 @@ def check_static_lib(lib): - raise FileNotFoundError('Linking libraries not supported by cudasim') + raise FileNotFoundError("Linking libraries not supported by cudasim") diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py b/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py index 2a011a77a..4fa5561db 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py @@ -1,7 +1,7 @@ -''' +""" NVVM is not supported in the simulator, but stubs are provided to allow tests to import correctly. -''' +""" class NvvmSupportError(ImportError): @@ -10,7 +10,7 @@ class NvvmSupportError(ImportError): class NVVM(object): def __init__(self): - raise NvvmSupportError('NVVM not supported in the simulator') + raise NvvmSupportError("NVVM not supported in the simulator") CompilationUnit = None diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py b/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py index 308d19e76..b38abedb6 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py @@ -1,7 +1,7 @@ -''' +""" The runtime API is unsupported in the simulator, but some stubs are provided to allow tests to import correctly. 
-''' +""" class FakeRuntime(object): @@ -13,7 +13,7 @@ def is_supported_version(self): @property def supported_versions(self): - return (-1, -1), + return ((-1, -1),) runtime = FakeRuntime() diff --git a/numba_cuda/numba/cuda/simulator/kernel.py b/numba_cuda/numba/cuda/simulator/kernel.py index b3ca22599..74d6d0dd3 100644 --- a/numba_cuda/numba/cuda/simulator/kernel.py +++ b/numba_cuda/numba/cuda/simulator/kernel.py @@ -41,9 +41,10 @@ def _get_kernel_context(): class FakeOverload: - ''' + """ Used only to provide the max_cooperative_grid_blocks method - ''' + """ + def max_cooperative_grid_blocks(self, blockdim): # We can only run one block in a cooperative grid because we have no # mechanism for synchronization between different blocks @@ -58,16 +59,16 @@ def __getitem__(self, key): class FakeCUDAKernel(object): - ''' + """ Wraps a @cuda.jit-ed function. - ''' + """ def __init__(self, fn, device, fastmath=False, extensions=[], debug=False): self.fn = fn self._device = device self._fastmath = fastmath self._debug = debug - self.extensions = list(extensions) # defensive copy + self.extensions = list(extensions) # defensive copy # Initial configuration: grid unconfigured, stream 0, no dynamic shared # memory. 
self.grid_dim = None @@ -82,11 +83,13 @@ def __call__(self, *args): return self.fn(*args) # Ensure we've been given a valid grid configuration - grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim, - self.block_dim) + grid_dim, block_dim = normalize_kernel_dimensions( + self.grid_dim, self.block_dim + ) - fake_cuda_module = FakeCUDAModule(grid_dim, block_dim, - self.dynshared_size) + fake_cuda_module = FakeCUDAModule( + grid_dim, block_dim, self.dynshared_size + ) with _push_kernel_context(fake_cuda_module): # fake_args substitutes all numpy arrays for FakeCUDAArrays # because they implement some semantics differently @@ -96,11 +99,10 @@ def fake_arg(arg): # map the arguments using any extension you've registered _, arg = functools.reduce( lambda ty_val, extension: extension.prepare_args( - *ty_val, - stream=0, - retr=retr), + *ty_val, stream=0, retr=retr + ), self.extensions, - (None, arg) + (None, arg), ) if isinstance(arg, np.ndarray) and arg.ndim > 0: @@ -126,8 +128,9 @@ def fake_arg(arg): wb() def __getitem__(self, configuration): - self.grid_dim, self.block_dim = \ - normalize_kernel_dimensions(*configuration[:2]) + self.grid_dim, self.block_dim = normalize_kernel_dimensions( + *configuration[:2] + ) if len(configuration) == 4: self.dynshared_size = configuration[3] @@ -142,8 +145,9 @@ def specialize(self, *args): def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): if ntasks < 0: - raise ValueError("Can't create ForAll with negative task count: %s" - % ntasks) + raise ValueError( + "Can't create ForAll with negative task count: %s" % ntasks + ) return self[ntasks, 1, stream, sharedmem] @property @@ -157,15 +161,19 @@ def py_func(self): # Thread emulation + class BlockThread(threading.Thread): - ''' + """ Manages the execution of a function for a single CUDA thread. 
- ''' + """ + def __init__(self, f, manager, blockIdx, threadIdx, debug): if debug: + def debug_wrapper(*args, **kwargs): - np.seterr(divide='raise') + np.seterr(divide="raise") f(*args, **kwargs) + target = debug_wrapper else: target = f @@ -181,27 +189,26 @@ def debug_wrapper(*args, **kwargs): self.abort = False self.debug = debug blockDim = Dim3(*self._manager._block_dim) - self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y + - blockDim.y * - self.threadIdx.z)) + self.thread_id = self.threadIdx.x + ( + blockDim.x * (self.threadIdx.y + blockDim.y * self.threadIdx.z) + ) def run(self): try: super(BlockThread, self).run() except Exception as e: - tid = 'tid=%s' % list(self.threadIdx) - ctaid = 'ctaid=%s' % list(self.blockIdx) - if str(e) == '': - msg = '%s %s' % (tid, ctaid) + tid = "tid=%s" % list(self.threadIdx) + ctaid = "ctaid=%s" % list(self.blockIdx) + if str(e) == "": + msg = "%s %s" % (tid, ctaid) else: - msg = '%s %s: %s' % (tid, ctaid, e) + msg = "%s %s: %s" % (tid, ctaid, e) tb = sys.exc_info()[2] # Using `with_traceback` here would cause it to be mutated by # future raise statements, which may or may not matter. self.exception = (type(e)(msg), tb) def syncthreads(self): - if self.abort: raise RuntimeError("abort flag set on syncthreads call") @@ -237,11 +244,11 @@ def syncthreads_or(self, value): return 1 if test else 0 def __str__(self): - return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx) + return "Thread <<<%s, %s>>>" % (self.blockIdx, self.threadIdx) class BlockManager(object): - ''' + """ Manages the execution of a thread block. When run() is called, all threads are started. Each thread executes until it @@ -257,7 +264,8 @@ class BlockManager(object): The polling continues until no threads are alive, when execution is complete. 
- ''' + """ + def __init__(self, f, grid_dim, block_dim, debug): self._grid_dim = grid_dim self._block_dim = block_dim @@ -271,8 +279,10 @@ def run(self, grid_point, *args): livethreads = set() blockedthreads = set() for block_point in np.ndindex(*self._block_dim): + def target(): self._f(*args) + t = BlockThread(target, self, grid_point, block_point, self._debug) t.start() threads.add(t) @@ -286,7 +296,6 @@ def target(): if t.syncthreads_blocked: blockedthreads.add(t) elif t.exception: - # Abort all other simulator threads on exception, # do *not* join immediately to facilitate debugging. for t_other in threads: @@ -300,7 +309,7 @@ def target(): t.syncthreads_blocked = False t.syncthreads_event.set() blockedthreads = set() - livethreads = set([ t for t in livethreads if t.is_alive() ]) + livethreads = set([t for t in livethreads if t.is_alive()]) # Final check for exceptions in case any were set prior to thread # finishing, before we could check it for t in threads: diff --git a/numba_cuda/numba/cuda/simulator/kernelapi.py b/numba_cuda/numba/cuda/simulator/kernelapi.py index 64793df05..49670ab3b 100644 --- a/numba_cuda/numba/cuda/simulator/kernelapi.py +++ b/numba_cuda/numba/cuda/simulator/kernelapi.py @@ -1,7 +1,7 @@ -''' +""" Implements the cuda module as called from within an executing kernel (@cuda.jit-decorated function). -''' +""" from contextlib import contextmanager import sys @@ -16,19 +16,20 @@ class Dim3(object): - ''' + """ Used to implement thread/block indices/dimensions - ''' + """ + def __init__(self, x, y, z): self.x = x self.y = y self.z = z def __str__(self): - return '(%s, %s, %s)' % (self.x, self.y, self.z) + return "(%s, %s, %s)" % (self.x, self.y, self.z) def __repr__(self): - return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z) + return "Dim3(%s, %s, %s)" % (self.x, self.y, self.z) def __iter__(self): yield self.x @@ -37,9 +38,9 @@ def __iter__(self): class GridGroup: - ''' + """ Used to implement the grid group. 
- ''' + """ def sync(self): # Synchronization of the grid group is equivalent to synchronization of @@ -49,17 +50,19 @@ def sync(self): class FakeCUDACg: - ''' + """ CUDA Cooperative Groups - ''' + """ + def this_grid(self): return GridGroup() class FakeCUDALocal(object): - ''' + """ CUDA Local arrays - ''' + """ + def array(self, shape, dtype): if isinstance(dtype, types.Type): dtype = numpy_support.as_dtype(dtype) @@ -67,21 +70,23 @@ def array(self, shape, dtype): class FakeCUDAConst(object): - ''' + """ CUDA Const arrays - ''' + """ + def array_like(self, ary): return ary class FakeCUDAShared(object): - ''' + """ CUDA Shared arrays. Limitations: assumes that only one call to cuda.shared.array is on a line, and that that line is only executed once per thread. i.e.:: - a = cuda.shared.array(...); b = cuda.shared.array(...) + a = cuda.shared.array(...) + b = cuda.shared.array(...) will erroneously alias a and b, and:: @@ -90,7 +95,7 @@ class FakeCUDAShared(object): will alias all arrays created at that point (though it is not certain that this would be supported by Numba anyway). - ''' + """ def __init__(self, dynshared_size): self._allocations = {} @@ -274,13 +279,13 @@ def hexp2(self, x): return np.exp2(x, dtype=np.float16) def hexp10(self, x): - return np.float16(10 ** x) + return np.float16(10**x) def hsqrt(self, x): return np.sqrt(x, dtype=np.float16) def hrsqrt(self, x): - return np.float16(x ** -0.5) + return np.float16(x**-0.5) def hceil(self, x): return np.ceil(x, dtype=np.float16) @@ -323,7 +328,7 @@ def hmin(self, a, b): class FakeCUDAModule(object): - ''' + """ An instance of this class will be injected into the __globals__ for an executing function in order to implement calls to cuda.*. This will fail to work correctly if the user code does:: @@ -331,7 +336,7 @@ class FakeCUDAModule(object): from numba import cuda as something_else In other words, the CUDA module must be called cuda. 
- ''' + """ def __init__(self, grid_dim, block_dim, dynshared_size): self.gridDim = Dim3(*grid_dim) @@ -426,11 +431,11 @@ def cbrt(self, a): return a ** (1 / 3) def brev(self, val): - return int('{:032b}'.format(val)[::-1], 2) + return int("{:032b}".format(val)[::-1], 2) def clz(self, val): - s = '{:032b}'.format(val) - return len(s) - len(s.lstrip('0')) + s = "{:032b}".format(val) + return len(s) - len(s.lstrip("0")) def ffs(self, val): # The algorithm is: @@ -438,8 +443,8 @@ def ffs(self, val): # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on. # 3. If we've counted 32 zeros (resulting in 33), there were no bits # set so we need to return zero. - s = '{:032b}'.format(val) - r = (len(s) - len(s.rstrip('0')) + 1) % 33 + s = "{:032b}".format(val) + r = (len(s) - len(s.rstrip("0")) + 1) % 33 return r def selp(self, a, b, c): diff --git a/numba_cuda/numba/cuda/simulator/reduction.py b/numba_cuda/numba/cuda/simulator/reduction.py index 1b819c043..5a3a8e87b 100644 --- a/numba_cuda/numba/cuda/simulator/reduction.py +++ b/numba_cuda/numba/cuda/simulator/reduction.py @@ -9,6 +9,7 @@ def reduce_wrapper(seq, res=None, init=0): return None else: return r + return reduce_wrapper diff --git a/numba_cuda/numba/cuda/simulator/vector_types.py b/numba_cuda/numba/cuda/simulator/vector_types.py index 82a6fbe8a..55792d9b8 100644 --- a/numba_cuda/numba/cuda/simulator/vector_types.py +++ b/numba_cuda/numba/cuda/simulator/vector_types.py @@ -3,7 +3,7 @@ class SimulatedVectorType: - attributes = ['x', 'y', 'z', 'w'] + attributes = ["x", "y", "z", "w"] def __init__(self, *args): args_flattened = [] @@ -12,7 +12,7 @@ def __init__(self, *args): args_flattened += arg.as_list() else: args_flattened.append(arg) - self._attrs = self.attributes[:len(args_flattened)] + self._attrs = self.attributes[: len(args_flattened)] if not self.num_elements == len(args_flattened): raise TypeError( f"{self.name} expects {self.num_elements}" @@ -35,11 +35,15 @@ def as_list(self): def 
make_simulated_vector_type(num_elements, name): - obj = type(name, (SimulatedVectorType,), { - "num_elements": num_elements, - "base_type": types.float32, - "name": name - }) + obj = type( + name, + (SimulatedVectorType,), + { + "num_elements": num_elements, + "base_type": types.float32, + "name": name, + }, + ) obj.user_facing_object = obj return obj @@ -48,8 +52,8 @@ def _initialize(): _simulated_vector_types = {} for stub in _vector_type_stubs: num_elements = int(stub.__name__[-1]) - _simulated_vector_types[stub.__name__] = ( - make_simulated_vector_type(num_elements, stub.__name__) + _simulated_vector_types[stub.__name__] = make_simulated_vector_type( + num_elements, stub.__name__ ) _simulated_vector_types[stub.__name__].aliases = stub.aliases return _simulated_vector_types diff --git a/numba_cuda/numba/cuda/simulator_init.py b/numba_cuda/numba/cuda/simulator_init.py index 9d7dd124a..fb2120632 100644 --- a/numba_cuda/numba/cuda/simulator_init.py +++ b/numba_cuda/numba/cuda/simulator_init.py @@ -4,14 +4,12 @@ def is_available(): - """Returns a boolean to indicate the availability of a CUDA GPU. - """ + """Returns a boolean to indicate the availability of a CUDA GPU.""" # Simulator is always available return True def cuda_error(): - """Returns None or an exception if the CUDA driver fails to initialize. - """ + """Returns None or an exception if the CUDA driver fails to initialize.""" # Simulator never fails to initialize return None diff --git a/numba_cuda/numba/cuda/stubs.py b/numba_cuda/numba/cuda/stubs.py index 205cf8045..a16607699 100644 --- a/numba_cuda/numba/cuda/stubs.py +++ b/numba_cuda/numba/cuda/stubs.py @@ -1,6 +1,7 @@ """ This scripts specifies all PTX special objects. 
""" + import numpy as np from collections import defaultdict import functools @@ -9,12 +10,13 @@ class Stub(object): - ''' + """ A stub object to represent special objects that are meaningless outside the context of a CUDA kernel - ''' - _description_ = '' - __slots__ = () # don't allocate __dict__ + """ + + _description_ = "" + __slots__ = () # don't allocate __dict__ def __new__(cls): raise NotImplementedError("%s is not instantiable" % cls) @@ -24,23 +26,26 @@ def __repr__(self): def stub_function(fn): - ''' + """ A stub function to represent special functions that are meaningless outside the context of a CUDA kernel - ''' + """ + @functools.wraps(fn) def wrapped(*args, **kwargs): raise NotImplementedError("%s cannot be called from host code" % fn) + return wrapped -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Thread and grid indices and dimensions class Dim3(Stub): - '''A triple, (x, y, z)''' - _description_ = '' + """A triple, (x, y, z)""" + + _description_ = "" @property def x(self): @@ -56,68 +61,76 @@ def z(self): class threadIdx(Dim3): - ''' + """ The thread indices in the current thread block. Each index is an integer spanning the range from 0 inclusive to the corresponding value of the attribute in :attr:`numba.cuda.blockDim` exclusive. - ''' - _description_ = '' + """ + + _description_ = "" class blockIdx(Dim3): - ''' + """ The block indices in the grid of thread blocks. Each index is an integer spanning the range from 0 inclusive to the corresponding value of the attribute in :attr:`numba.cuda.gridDim` exclusive. - ''' - _description_ = '' + """ + + _description_ = "" class blockDim(Dim3): - ''' + """ The shape of a block of threads, as declared when instantiating the kernel. This value is the same for all threads in a given kernel launch, even if they belong to different blocks (i.e. each block is "full"). 
- ''' - _description_ = '' + """ + + _description_ = "" class gridDim(Dim3): - ''' + """ The shape of the grid of blocks. This value is the same for all threads in a given kernel launch. - ''' - _description_ = '' + """ + + _description_ = "" class warpsize(Stub): - ''' + """ The size of a warp. All architectures implemented to date have a warp size of 32. - ''' - _description_ = '' + """ + + _description_ = "" class laneid(Stub): - ''' + """ This thread's lane within a warp. Ranges from 0 to :attr:`numba.cuda.warpsize` - 1. - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Array creation + class shared(Stub): - ''' + """ Shared memory namespace - ''' - _description_ = '' + """ + + _description_ = "" @stub_function def array(shape, dtype): - ''' + """ Allocate a shared array of the given *shape* and *type*. *shape* is either an integer or a tuple of integers representing the array's dimensions. *type* is a :ref:`Numba type ` of the @@ -125,83 +138,89 @@ def array(shape, dtype): The returned array-like object can be read and written to like any normal device array (e.g. through indexing). - ''' + """ class local(Stub): - ''' + """ Local memory namespace - ''' - _description_ = '' + """ + + _description_ = "" @stub_function def array(shape, dtype): - ''' + """ Allocate a local array of the given *shape* and *type*. The array is private to the current thread, and resides in global memory. An array-like object is returned which can be read and written to like any standard array (e.g. through indexing). - ''' + """ class const(Stub): - ''' + """ Constant memory namespace - ''' + """ @stub_function def array_like(ndarray): - ''' + """ Create a const array from *ndarry*. The resulting const array will have the same shape, type, and values as *ndarray*. 
- ''' + """ # ------------------------------------------------------------------------------- # warp level operations + class syncwarp(Stub): - ''' + """ syncwarp(mask=0xFFFFFFFF) Synchronizes a masked subset of threads in a warp. - ''' - _description_ = '' + """ + + _description_ = "" class shfl_sync_intrinsic(Stub): - ''' + """ shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp) Nvvm intrinsic for shuffling data across a warp docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove - ''' - _description_ = '' + """ + + _description_ = "" class vote_sync_intrinsic(Stub): - ''' + """ vote_sync_intrinsic(mask, mode, predictate) Nvvm intrinsic for performing a reduce and broadcast across a warp docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote - ''' - _description_ = '' + """ + + _description_ = "" class match_any_sync(Stub): - ''' + """ match_any_sync(mask, value) Nvvm intrinsic for performing a compare and broadcast across a warp. Returns a mask of threads that have same value as the given value from within the masked warp. - ''' - _description_ = '' + """ + + _description_ = "" class match_all_sync(Stub): - ''' + """ match_all_sync(mask, value) Nvvm intrinsic for performing a compare and broadcast across a warp. @@ -209,12 +228,13 @@ class match_all_sync(Stub): same value as the given value from within the masked warp, if they all have the same value, otherwise it is 0. Pred is a boolean of whether or not all threads in the mask warp have the same warp. - ''' - _description_ = '' + """ + + _description_ = "" class activemask(Stub): - ''' + """ activemask() Returns a 32-bit integer mask of all currently active threads in the @@ -222,47 +242,54 @@ class activemask(Stub): activemask() is called. Inactive threads are represented by 0 bits in the returned mask. Threads which have exited the kernel are always marked as inactive. 
- ''' - _description_ = '' + """ + + _description_ = "" class lanemask_lt(Stub): - ''' + """ lanemask_lt() Returns a 32-bit integer mask of all lanes (including inactive ones) with ID less than the current lane. - ''' - _description_ = '' + """ + + _description_ = "" # ------------------------------------------------------------------------------- # memory fences + class threadfence_block(Stub): - ''' + """ A memory fence at thread block level - ''' - _description_ = '' + """ + + _description_ = "" class threadfence_system(Stub): - ''' + """ A memory fence at system level: across devices - ''' - _description_ = '' + """ + + _description_ = "" class threadfence(Stub): - ''' + """ A memory fence at device level - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # bit manipulation + class popc(Stub): """ popc(x) @@ -297,9 +324,10 @@ class ffs(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # comparison and selection instructions + class selp(Stub): """ selp(a, b, c) @@ -309,9 +337,10 @@ class selp(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # single / double precision arithmetic + class fma(Stub): """ fma(a, b, c) @@ -321,20 +350,21 @@ class fma(Stub): class cbrt(Stub): - """" + """ " cbrt(a) Perform the cube root operation. 
""" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # atomic + class atomic(Stub): - """Namespace for atomic operations - """ - _description_ = '' + """Namespace for atomic operations""" + + _description_ = "" class add(Stub): """add(ary, idx, val) @@ -401,8 +431,7 @@ class dec(Stub): Performs:: - ary[idx] = (val if (ary[idx] == 0) or - (ary[idx] > val) else ary[idx] - 1) + ary[idx] = val if (ary[idx] == 0) or (ary[idx] > val) else ary[idx] - 1 Supported on uint32, and uint64 operands only. @@ -497,26 +526,29 @@ class cas(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # timers + class nanosleep(Stub): - ''' + """ nanosleep(ns) Suspends the thread for a sleep duration approximately close to the delay `ns`, specified in nanoseconds. - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- + +# ------------------------------------------------------------------------------- # Floating point 16 class fp16(Stub): - """Namespace for fp16 operations - """ - _description_ = '' + """Namespace for fp16 operations""" + + _description_ = "" class hadd(Stub): """hadd(a, b) @@ -817,9 +849,10 @@ class hmin(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # vector types + def make_vector_type_stubs(): """Make user facing objects for vector types""" vector_type_stubs = [] @@ -833,7 +866,7 @@ def make_vector_type_stubs(): "uint32", "uint64", "float32", - "float64" + "float64", ) vector_type_element_counts = (1, 2, 3, 4) vector_type_attribute_names = ("x", "y", "z", "w") @@ -845,21 +878,25 @@ def make_vector_type_stubs(): attr_names = 
vector_type_attribute_names[:nelem] vector_type_stub = type( - type_name, (Stub,), + type_name, + (Stub,), { **{attr: lambda self: None for attr in attr_names}, **{ "_description_": f"<{type_name}>", - "__signature__": Signature(parameters=[ - Parameter( - name=attr_name, kind=Parameter.POSITIONAL_ONLY - ) for attr_name in attr_names[:nelem] - ]), + "__signature__": Signature( + parameters=[ + Parameter( + name=attr_name, kind=Parameter.POSITIONAL_ONLY + ) + for attr_name in attr_names[:nelem] + ] + ), "__doc__": f"A stub for {type_name} to be used in " - "CUDA kernels." + "CUDA kernels.", }, - **{"aliases": []} - } + **{"aliases": []}, + }, ) vector_type_stubs.append(vector_type_stub) return vector_type_stubs @@ -884,7 +921,7 @@ def map_vector_type_stubs_to_alias(vector_type_stubs): "ulong": f"uint{np.dtype(np.uint).itemsize * 8}", "ulonglong": f"uint{np.dtype(np.ulonglong).itemsize * 8}", "float": f"float{np.dtype(np.single).itemsize * 8}", - "double": f"float{np.dtype(np.double).itemsize * 8}" + "double": f"float{np.dtype(np.double).itemsize * 8}", } base_type_to_vector_type = defaultdict(list) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 5713cdf93..ead46f446 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -35,19 +35,21 @@ def load_additional_registries(self): def resolve_value_type(self, val): # treat other dispatcher object as another device function from numba.cuda.dispatcher import CUDADispatcher - if (isinstance(val, Dispatcher) and not - isinstance(val, CUDADispatcher)): + + if isinstance(val, Dispatcher) and not isinstance(val, CUDADispatcher): try: # use cached device function val = val.__dispatcher except AttributeError: if not val._can_compile: - raise ValueError('using cpu function on device ' - 'but its compilation is disabled') + raise ValueError( + "using cpu function on device " + "but its compilation is disabled" + ) targetoptions = val.targetoptions.copy() - 
targetoptions['device'] = True - targetoptions['debug'] = targetoptions.get('debug', False) - targetoptions['opt'] = targetoptions.get('opt', True) + targetoptions["device"] = True + targetoptions["debug"] = targetoptions.get("debug", False) + targetoptions["opt"] = targetoptions.get("opt", True) disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. @@ -57,18 +59,19 @@ def resolve_value_type(self, val): # continue with parent logic return super(CUDATypingContext, self).resolve_value_type(val) + # ----------------------------------------------------------------------------- # Implementation -VALID_CHARS = re.compile(r'[^a-z0-9]', re.I) +VALID_CHARS = re.compile(r"[^a-z0-9]", re.I) class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True - def __init__(self, typingctx, target='cuda'): + def __init__(self, typingctx, target="cuda"): super().__init__(typingctx, target) self.data_model_manager = cuda_data_manager.chain( datamodel.default_manager @@ -76,7 +79,7 @@ def __init__(self, typingctx, target='cuda'): @property def enable_nrt(self): - return getattr(config, 'CUDA_ENABLE_NRT', False) + return getattr(config, "CUDA_ENABLE_NRT", False) @property def DIBuilder(self): @@ -98,18 +101,17 @@ def init(self): def load_additional_registries(self): # side effect of import needed for numba.cpython.*, the builtins # registry is updated at import time. 
- from numba.cpython import numbers, tupleobj, slicing # noqa: F401 - from numba.cpython import rangeobj, iterators, enumimpl # noqa: F401 - from numba.cpython import unicode, charseq # noqa: F401 + from numba.cpython import numbers, tupleobj, slicing # noqa: F401 + from numba.cpython import rangeobj, iterators, enumimpl # noqa: F401 + from numba.cpython import unicode, charseq # noqa: F401 from numba.cpython import cmathimpl from numba.misc import cffiimpl - from numba.np import arrayobj # noqa: F401 - from numba.np import npdatetime # noqa: F401 - from . import ( - cudaimpl, printimpl, libdeviceimpl, mathimpl, vector_types - ) + from numba.np import arrayobj # noqa: F401 + from numba.np import npdatetime # noqa: F401 + from . import cudaimpl, printimpl, libdeviceimpl, mathimpl, vector_types + # fix for #8940 - from numba.np.unsafe import ndarray # noqa F401 + from numba.np.unsafe import ndarray # noqa F401 self.install_registry(cudaimpl.registry) self.install_registry(cffiimpl.registry) @@ -136,10 +138,18 @@ def nonconst_module_attrs(self): These include threadIdx, blockDim, etc. 
""" from numba import cuda - nonconsts = ('threadIdx', 'blockDim', 'blockIdx', 'gridDim', 'laneid', - 'warpsize') - nonconsts_with_mod = tuple([(types.Module(cuda), nc) - for nc in nonconsts]) + + nonconsts = ( + "threadIdx", + "blockDim", + "blockIdx", + "gridDim", + "laneid", + "warpsize", + ) + nonconsts_with_mod = tuple( + [(types.Module(cuda), nc) for nc in nonconsts] + ) return nonconsts_with_mod @cached_property @@ -147,8 +157,9 @@ def call_conv(self): return CUDACallConv(self) def mangler(self, name, argtypes, *, abi_tags=(), uid=None): - return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, - uid=uid) + return itanium_mangler.mangle( + name, argtypes, abi_tags=abi_tags, uid=uid + ) def make_constant_array(self, builder, aryty, arr): """ @@ -160,15 +171,16 @@ def make_constant_array(self, builder, aryty, arr): constvals = [ self.get_constant(types.byte, i) - for i in iter(arr.tobytes(order='A')) + for i in iter(arr.tobytes(order="A")) ] constaryty = ir.ArrayType(ir.IntType(8), len(constvals)) constary = ir.Constant(constaryty, constvals) addrspace = nvvm.ADDRSPACE_CONSTANT - gv = cgutils.add_global_variable(lmod, constary.type, "_cudapy_cmem", - addrspace=addrspace) - gv.linkage = 'internal' + gv = cgutils.add_global_variable( + lmod, constary.type, "_cudapy_cmem", addrspace=addrspace + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = constary @@ -179,17 +191,21 @@ def make_constant_array(self, builder, aryty, arr): # Convert to generic address-space ptrty = ir.PointerType(ir.IntType(8)) - genptr = builder.addrspacecast(gv, ptrty, 'generic') + genptr = builder.addrspacecast(gv, ptrty, "generic") # Create array object ary = self.make_array(aryty)(self, builder) kshape = [self.get_constant(types.intp, s) for s in arr.shape] kstrides = [self.get_constant(types.intp, s) for s in arr.strides] - self.populate_array(ary, data=builder.bitcast(genptr, ary.data.type), - shape=kshape, - strides=kstrides, - itemsize=ary.itemsize, 
parent=ary.parent, - meminfo=None) + self.populate_array( + ary, + data=builder.bitcast(genptr, ary.data.type), + shape=kshape, + strides=kstrides, + itemsize=ary.itemsize, + parent=ary.parent, + meminfo=None, + ) return ary._getvalue() @@ -199,15 +215,17 @@ def insert_const_string(self, mod, string): addrspace. """ text = cgutils.make_bytearray(string.encode("utf-8") + b"\x00") - name = '$'.join(["__conststring__", - itanium_mangler.mangle_identifier(string)]) + name = "$".join( + ["__conststring__", itanium_mangler.mangle_identifier(string)] + ) # Try to reuse existing global gv = mod.globals.get(name) if gv is None: # Not defined yet - gv = cgutils.add_global_variable(mod, text.type, name, - addrspace=nvvm.ADDRSPACE_CONSTANT) - gv.linkage = 'internal' + gv = cgutils.add_global_variable( + mod, text.type, name, addrspace=nvvm.ADDRSPACE_CONSTANT + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = text @@ -225,11 +243,10 @@ def insert_string_const_addrspace(self, builder, string): lmod = builder.module gv = self.insert_const_string(lmod, string) charptrty = ir.PointerType(ir.IntType(8)) - return builder.addrspacecast(gv, charptrty, 'generic') + return builder.addrspacecast(gv, charptrty, "generic") def optimize_function(self, func): - """Run O1 function passes - """ + """Run O1 function passes""" pass ## XXX skipped for now # fpm = lp.FunctionPassManager.new(func.module) @@ -266,8 +283,9 @@ def _make_call_helper(self, builder): def return_value(self, builder, retval): return builder.ret(retval) - def return_user_exc(self, builder, exc, exc_args=None, loc=None, - func_name=None): + def return_user_exc( + self, builder, exc, exc_args=None, loc=None, func_name=None + ): msg = "Python exceptions are unsupported in the CUDA C/C++ ABI" raise NotImplementedError(msg) @@ -290,8 +308,7 @@ def decorate_function(self, fn, args, fe_argtypes, noalias=False): """ assert not noalias arginfo = self._get_arg_packer(fe_argtypes) - 
arginfo.assign_names(self.get_arguments(fn), - ['arg.' + a for a in args]) + arginfo.assign_names(self.get_arguments(fn), ["arg." + a for a in args]) def get_arguments(self, func): """ diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 3c2d7bf46..86a95e789 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -11,7 +11,7 @@ import unittest numba_cuda_dir = Path(__file__).parent -test_data_dir = numba_cuda_dir / 'tests' / 'data' +test_data_dir = numba_cuda_dir / "tests" / "data" class CUDATestCase(SerialMixin, TestCase): @@ -55,6 +55,7 @@ class ContextResettingTestCase(CUDATestCase): def tearDown(self): super().tearDown() from numba.cuda.cudadrv.devices import reset + reset() @@ -89,26 +90,26 @@ def skip_unless_conda_cudatoolkit(reason): def skip_if_external_memmgr(reason): """Skip test if an EMM Plugin is in use""" - return unittest.skipIf(config.CUDA_MEMORY_MANAGER != 'default', reason) + return unittest.skipIf(config.CUDA_MEMORY_MANAGER != "default", reason) def skip_under_cuda_memcheck(reason): - return unittest.skipIf(os.environ.get('CUDA_MEMCHECK') is not None, reason) + return unittest.skipIf(os.environ.get("CUDA_MEMCHECK") is not None, reason) def skip_without_nvdisasm(reason): - nvdisasm_path = shutil.which('nvdisasm') + nvdisasm_path = shutil.which("nvdisasm") return unittest.skipIf(nvdisasm_path is None, reason) def skip_with_nvdisasm(reason): - nvdisasm_path = shutil.which('nvdisasm') + nvdisasm_path = shutil.which("nvdisasm") return unittest.skipIf(nvdisasm_path is not None, reason) def skip_on_arm(reason): cpu = platform.processor() - is_arm = cpu.startswith('arm') or cpu.startswith('aarch') + is_arm = cpu.startswith("arm") or cpu.startswith("aarch") return unittest.skipIf(is_arm, reason) @@ -116,25 +117,27 @@ def skip_if_cuda_includes_missing(fn): # Skip when cuda.h is not available - generally this should indicate # whether the CUDA includes are available or not cuda_include_path = 
libs.get_cuda_include_dir() - cuda_h = os.path.join(cuda_include_path, 'cuda.h') - cuda_h_file = (os.path.exists(cuda_h) and os.path.isfile(cuda_h)) - reason = 'CUDA include dir not available on this system' + cuda_h = os.path.join(cuda_include_path, "cuda.h") + cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h) + reason = "CUDA include dir not available on this system" return unittest.skipUnless(cuda_h_file, reason)(fn) def skip_if_curand_kernel_missing(fn): cuda_include_path = libs.get_cuda_include_dir() - curand_kernel_h = os.path.join(cuda_include_path, 'curand_kernel.h') - curand_kernel_h_file = (os.path.exists(curand_kernel_h) and - os.path.isfile(curand_kernel_h)) - reason = 'curand_kernel.h not available on this system' + curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h") + curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile( + curand_kernel_h + ) + reason = "curand_kernel.h not available on this system" return unittest.skipUnless(curand_kernel_h_file, reason)(fn) def skip_if_mvc_enabled(reason): """Skip a test if Minor Version Compatibility is enabled""" - return unittest.skipIf(config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, - reason) + return unittest.skipIf( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, reason + ) def skip_if_mvc_libraries_unavailable(fn): @@ -142,12 +145,14 @@ def skip_if_mvc_libraries_unavailable(fn): try: import cubinlinker # noqa: F401 import ptxcompiler # noqa: F401 + libs_available = True except ImportError: pass - return unittest.skipUnless(libs_available, - "Requires cubinlinker and ptxcompiler")(fn) + return unittest.skipUnless( + libs_available, "Requires cubinlinker and ptxcompiler" + )(fn) def cc_X_or_above(major, minor): @@ -189,7 +194,7 @@ def cudadevrt_missing(): if config.ENABLE_CUDASIM: return False try: - path = libs.get_cudalib('cudadevrt', static=True) + path = libs.get_cudalib("cudadevrt", static=True) libs.check_static_lib(path) except FileNotFoundError: 
return True @@ -197,7 +202,7 @@ def cudadevrt_missing(): def skip_if_cudadevrt_missing(fn): - return unittest.skipIf(cudadevrt_missing(), 'cudadevrt missing')(fn) + return unittest.skipIf(cudadevrt_missing(), "cudadevrt missing")(fn) class ForeignArray(object): diff --git a/numba_cuda/numba/cuda/tests/__init__.py b/numba_cuda/numba/cuda/tests/__init__.py index 425a52b2e..d04d546ed 100644 --- a/numba_cuda/numba/cuda/tests/__init__.py +++ b/numba_cuda/numba/cuda/tests/__init__.py @@ -19,18 +19,19 @@ def load_testsuite(loader, dir): files = [] for f in os.listdir(dir): path = join(dir, f) - if isfile(path) and fnmatch(f, 'test_*.py'): + if isfile(path) and fnmatch(f, "test_*.py"): files.append(f) - elif isfile(join(path, '__init__.py')): - suite.addTests(loader.discover(path, - top_level_dir=top_level_dir)) + elif isfile(join(path, "__init__.py")): + suite.addTests( + loader.discover(path, top_level_dir=top_level_dir) + ) for f in files: # turn 'f' into a filename relative to the toplevel dir and # translate it to a module name. This differs from the # implementation in Numba, because the toplevel dir is the # numba_cuda module location, not the numba one. 
f = relpath(join(dir, f), top_level_dir) - f = splitext(normpath(f.replace(os.path.sep, '.')))[0] + f = splitext(normpath(f.replace(os.path.sep, ".")))[0] suite.addTests(loader.loadTestsFromName(f)) return suite except Exception: @@ -42,16 +43,17 @@ def load_tests(loader, tests, pattern): suite = unittest.TestSuite() this_dir = dirname(__file__) ensure_supported_ccs_initialized() - suite.addTests(load_testsuite(loader, join(this_dir, 'nocuda'))) + suite.addTests(load_testsuite(loader, join(this_dir, "nocuda"))) if cuda.is_available(): - suite.addTests(load_testsuite(loader, join(this_dir, 'cudasim'))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudasim"))) gpus = cuda.list_devices() if gpus and gpus[0].compute_capability >= (2, 0): - suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv'))) - suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy'))) - suite.addTests(load_testsuite(loader, join(this_dir, 'nrt'))) - suite.addTests(load_testsuite(loader, join(this_dir, - 'doc_examples'))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudadrv"))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudapy"))) + suite.addTests(load_testsuite(loader, join(this_dir, "nrt"))) + suite.addTests( + load_testsuite(loader, join(this_dir, "doc_examples")) + ) else: print("skipped CUDA tests because GPU CC < 2.0") else: diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py b/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py index 32f75c855..27a61cf5e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py @@ -4,7 +4,6 @@ class TestArrayAttr(CUDATestCase): - def test_contigous_2d(self): ary = np.arange(10) cary = ary.reshape(2, 5) @@ -44,7 +43,7 @@ def test_contigous_4d(self): def test_ravel_1d(self): ary = np.arange(60) dary = cuda.to_device(ary) - for order in 'CFA': + for order in "CFA": expect = ary.ravel(order=order) dflat = 
dary.ravel(order=order) flat = dflat.copy_to_host() @@ -52,14 +51,14 @@ def test_ravel_1d(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_1d(self): ary = np.arange(60) dary = cuda.to_device(ary) # No-copy stride device array darystride = dary[::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] self.assertEqual(dary_data, ddarystride_data) # Fail on ravel on non-contiguous array with self.assertRaises(NotImplementedError): @@ -69,7 +68,7 @@ def test_ravel_c(self): ary = np.arange(60) reshaped = ary.reshape(2, 5, 2, 3) - expect = reshaped.ravel(order='C') + expect = reshaped.ravel(order="C") dary = cuda.to_device(reshaped) dflat = dary.ravel() flat = dflat.copy_to_host() @@ -78,7 +77,7 @@ def test_ravel_c(self): self.assertPreciseEqual(expect, flat) # explicit order kwarg - for order in 'CA': + for order in "CA": expect = reshaped.ravel(order=order) dary = cuda.to_device(reshaped) dflat = dary.ravel(order=order) @@ -87,15 +86,15 @@ def test_ravel_c(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_c(self): ary = np.arange(60) reshaped = ary.reshape(2, 5, 2, 3) dary = cuda.to_device(reshaped) darystride = dary[::2, ::2, ::2, ::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] 
self.assertEqual(dary_data, ddarystride_data) with self.assertRaises(NotImplementedError): darystride.ravel() @@ -103,7 +102,7 @@ def test_ravel_stride_c(self): def test_ravel_f(self): ary = np.arange(60) reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3)) - for order in 'FA': + for order in "FA": expect = reshaped.ravel(order=order) dary = cuda.to_device(reshaped) dflat = dary.ravel(order=order) @@ -112,14 +111,14 @@ def test_ravel_f(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_f(self): ary = np.arange(60) reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3)) dary = cuda.to_device(reshaped) darystride = dary[::2, ::2, ::2, ::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] self.assertEqual(dary_data, ddarystride_data) with self.assertRaises(NotImplementedError): darystride.ravel() @@ -134,12 +133,12 @@ def test_reshape_c(self): def test_reshape_f(self): ary = np.arange(10) - expect = ary.reshape(2, 5, order='F') + expect = ary.reshape(2, 5, order="F") dary = cuda.to_device(ary) - dary_reshaped = dary.reshape(2, 5, order='F') + dary_reshaped = dary.reshape(2, 5, order="F") got = dary_reshaped.copy_to_host() self.assertPreciseEqual(expect, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py index 030052507..049804e7a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py @@ -27,7 +27,6 @@ def test_gpus_iter(self): class 
TestContextAPI(CUDATestCase): - def tearDown(self): super().tearDown() cuda.close() @@ -36,7 +35,7 @@ def test_context_memory(self): try: mem = cuda.current_context().get_memory_info() except NotImplementedError: - self.skipTest('EMM Plugin does not implement get_memory_info()') + self.skipTest("EMM Plugin does not implement get_memory_info()") self.assertIsInstance(mem.free, numbers.Number) self.assertEqual(mem.free, mem[0]) @@ -47,7 +46,7 @@ def test_context_memory(self): self.assertLessEqual(mem.free, mem.total) @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") - @skip_on_cudasim('CUDA HW required') + @skip_on_cudasim("CUDA HW required") def test_forbidden_context_switch(self): # Cannot switch context inside a `cuda.require_context` @cuda.require_context @@ -72,7 +71,7 @@ def switch_gpu(): self.assertEqual(int(devid), 1) -@skip_on_cudasim('CUDA HW required') +@skip_on_cudasim("CUDA HW required") class Test3rdPartyContext(CUDATestCase): def tearDown(self): super().tearDown() @@ -118,8 +117,9 @@ def test_attached_non_primary(self): cuda.current_context() except RuntimeError as e: # Expecting an error about non-primary CUDA context - self.assertIn("Numba cannot operate on non-primary CUDA context ", - str(e)) + self.assertIn( + "Numba cannot operate on non-primary CUDA context ", str(e) + ) else: self.fail("No RuntimeError raised") finally: @@ -141,5 +141,5 @@ def foo(a): self.test_attached_primary(do) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py index 5033a115f..ec8f239b3 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py @@ -11,7 +11,7 @@ class CudaArrayIndexing(CUDATestCase): def test_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) - x, = arr.shape + (x,) = arr.shape for i in 
range(-x, x): self.assertEqual(arr[i], darr[i]) with self.assertRaises(IndexError): @@ -58,7 +58,6 @@ def test_index_3d(self): class CudaArrayStridedSlice(CUDATestCase): - def test_strided_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) @@ -71,8 +70,9 @@ def test_strided_index_2d(self): for i in range(arr.shape[0]): for j in range(arr.shape[1]): - np.testing.assert_equal(arr[i::2, j::2], - darr[i::2, j::2].copy_to_host()) + np.testing.assert_equal( + arr[i::2, j::2], darr[i::2, j::2].copy_to_host() + ) def test_strided_index_3d(self): arr = np.arange(6 * 7 * 8).reshape(6, 7, 8) @@ -83,7 +83,8 @@ def test_strided_index_3d(self): for k in range(arr.shape[2]): np.testing.assert_equal( arr[i::2, j::2, k::2], - darr[i::2, j::2, k::2].copy_to_host()) + darr[i::2, j::2, k::2].copy_to_host(), + ) class CudaArraySlicing(CUDATestCase): @@ -96,7 +97,7 @@ def test_prefix_1d(self): self.assertTrue(np.all(expect == got)) def test_prefix_2d(self): - arr = np.arange(3 ** 2).reshape(3, 3) + arr = np.arange(3**2).reshape(3, 3) darr = cuda.to_device(arr) for i in range(arr.shape[0]): for j in range(arr.shape[1]): @@ -129,39 +130,45 @@ def test_select_3d_first_two_dim(self): self.assertTrue(np.all(expect == got)) def test_select_f(self): - a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='F') + a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order="F") da = cuda.to_device(a) for i in range(a.shape[0]): for j in range(a.shape[1]): - self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(), - a[i, j, :])) + self.assertTrue( + np.array_equal(da[i, j, :].copy_to_host(), a[i, j, :]) + ) for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(), - a[i, :, j])) + self.assertTrue( + np.array_equal(da[i, :, j].copy_to_host(), a[i, :, j]) + ) for i in range(a.shape[1]): for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(), - a[:, i, j])) + self.assertTrue( + np.array_equal(da[:, i, j].copy_to_host(), a[:, i, j]) + ) def 
test_select_c(self): - a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='C') + a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order="C") da = cuda.to_device(a) for i in range(a.shape[0]): for j in range(a.shape[1]): - self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(), - a[i, j, :])) + self.assertTrue( + np.array_equal(da[i, j, :].copy_to_host(), a[i, j, :]) + ) for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(), - a[i, :, j])) + self.assertTrue( + np.array_equal(da[i, :, j].copy_to_host(), a[i, :, j]) + ) for i in range(a.shape[1]): for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(), - a[:, i, j])) + self.assertTrue( + np.array_equal(da[:, i, j].copy_to_host(), a[:, i, j]) + ) def test_prefix_select(self): - arr = np.arange(5 * 7).reshape(5, 7, order='F') + arr = np.arange(5 * 7).reshape(5, 7, order="F") darr = cuda.to_device(arr) self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1])) @@ -170,15 +177,15 @@ def test_negative_slicing_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) for i, j in product(range(-10, 10), repeat=2): - np.testing.assert_array_equal(arr[i:j], - darr[i:j].copy_to_host()) + np.testing.assert_array_equal(arr[i:j], darr[i:j].copy_to_host()) def test_negative_slicing_2d(self): arr = np.arange(12).reshape(3, 4) darr = cuda.to_device(arr) for x, y, w, s in product(range(-4, 4), repeat=4): - np.testing.assert_array_equal(arr[x:y, w:s], - darr[x:y, w:s].copy_to_host()) + np.testing.assert_array_equal( + arr[x:y, w:s], darr[x:y, w:s].copy_to_host() + ) def test_empty_slice_1d(self): arr = np.arange(5) @@ -188,10 +195,10 @@ def test_empty_slice_1d(self): # empty slice of empty slice self.assertFalse(darr[:0][:0].copy_to_host()) # out-of-bound slice just produces empty slices - np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), - arr[:0][:1]) - np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(), - arr[:0][-1:]) + 
np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1]) + np.testing.assert_array_equal( + darr[:0][-1:].copy_to_host(), arr[:0][-1:] + ) def test_empty_slice_2d(self): arr = np.arange(5 * 7).reshape(5, 7) @@ -202,8 +209,9 @@ def test_empty_slice_2d(self): self.assertFalse(darr[:0][:0].copy_to_host()) # out-of-bound slice just produces empty slices np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1]) - np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(), - arr[:0][-1:]) + np.testing.assert_array_equal( + darr[:0][-1:].copy_to_host(), arr[:0][-1:] + ) class CudaArraySetting(CUDATestCase): @@ -292,7 +300,8 @@ def test_incompatible_highdim(self): "Can't assign 3-D array to 1-D self", # device "could not broadcast input array from shape (2,3) " "into shape (35,)", # simulator, NP >= 1.20 - ]) + ], + ) def test_incompatible_shape(self): darr = cuda.to_device(np.arange(5)) @@ -306,57 +315,67 @@ def test_incompatible_shape(self): "Can't copy sequence with size 2 to array axis 0 with " "dimension 5", # device "could not broadcast input array from shape (2,) into " - "shape (5,)", # simulator, NP >= 1.20 - ]) + "shape (5,)", # simulator, NP >= 1.20 + ], + ) - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_sync(self): # There should be a synchronization when no stream is supplied darr = cuda.to_device(np.arange(5)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr[0] = 10 mock_sync.assert_called_once() - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_no_sync_default_stream(self): # There should not be a synchronization when the array has a default # 
stream, whether it is the default stream, the legacy default stream, # the per-thread default stream, or another stream. - streams = (cuda.stream(), cuda.default_stream(), - cuda.legacy_default_stream(), - cuda.per_thread_default_stream()) + streams = ( + cuda.stream(), + cuda.default_stream(), + cuda.legacy_default_stream(), + cuda.per_thread_default_stream(), + ) for stream in streams: darr = cuda.to_device(np.arange(5), stream=stream) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr[0] = 10 mock_sync.assert_not_called() - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_no_sync_supplied_stream(self): # There should not be a synchronization when a stream is supplied for # the setitem call, whether it is the default stream, the legacy default # stream, the per-thread default stream, or another stream. 
- streams = (cuda.stream(), cuda.default_stream(), - cuda.legacy_default_stream(), - cuda.per_thread_default_stream()) + streams = ( + cuda.stream(), + cuda.default_stream(), + cuda.legacy_default_stream(), + cuda.per_thread_default_stream(), + ) for stream in streams: darr = cuda.to_device(np.arange(5)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr.setitem(0, 10, stream=stream) mock_sync.assert_not_called() - @unittest.skip('Requires PR #6367') + @unittest.skip("Requires PR #6367") def test_issue_6505(self): # On Windows, the writes to ary_v would not be visible prior to the # assertion, due to the assignment being done with a kernel launch that @@ -365,11 +384,11 @@ def test_issue_6505(self): ary = cuda.mapped_array(2, dtype=np.int32) ary[:] = 0 - ary_v = ary.view('u1') + ary_v = ary.view("u1") ary_v[1] = 1 ary_v[5] = 1 self.assertEqual(sum(ary), 512) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py index 4a4d59310..45815be70 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py @@ -17,5 +17,5 @@ def test_auto_context(self): self.assertTrue(np.allclose(A, newA)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py index e2acd34d7..b13c8f979 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py @@ -1,7 +1,10 @@ import numpy as np import ctypes -from numba.cuda.cudadrv.devicearray import (DeviceRecord, from_record_like, 
- auto_device) +from numba.cuda.cudadrv.devicearray import ( + DeviceRecord, + from_record_like, + auto_device, +) from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim from numba.np import numpy_support @@ -11,43 +14,37 @@ recordtype = np.dtype( [ - ('a', np.float64), - ('b', np.int32), - ('c', np.complex64), - ('d', (np.str_, N_CHARS)) + ("a", np.float64), + ("b", np.int32), + ("c", np.complex64), + ("d", (np.str_, N_CHARS)), ], - align=True + align=True, ) -recordwitharray = np.dtype( - [ - ('g', np.int32), - ('h', np.float32, 2) - ], - align=True -) +recordwitharray = np.dtype([("g", np.int32), ("h", np.float32, 2)], align=True) -recwithmat = np.dtype([('i', np.int32), - ('j', np.float32, (3, 3))]) +recwithmat = np.dtype([("i", np.int32), ("j", np.float32, (3, 3))]) -recwithrecwithmat = np.dtype([('x', np.int32), ('y', recwithmat)]) +recwithrecwithmat = np.dtype([("x", np.int32), ("y", recwithmat)]) -@skip_on_cudasim('Device Record API unsupported in the simulator') +@skip_on_cudasim("Device Record API unsupported in the simulator") class TestCudaDeviceRecord(CUDATestCase): """ Tests the DeviceRecord class with np.void host types. 
""" + def setUp(self): super().setUp() self._create_data(np.zeros) def _create_data(self, array_ctor): - self.dtype = np.dtype([('a', np.int32), ('b', np.float32)], align=True) + self.dtype = np.dtype([("a", np.int32), ("b", np.float32)], align=True) self.hostz = array_ctor(1, self.dtype)[0] self.hostnz = array_ctor(1, self.dtype)[0] - self.hostnz['a'] = 10 - self.hostnz['b'] = 11.0 + self.hostnz["a"] = 10 + self.hostnz["b"] = 11.0 def _check_device_record(self, reference, rec): self.assertEqual(rec.shape, tuple()) @@ -111,21 +108,22 @@ class TestCudaDeviceRecordWithRecord(TestCudaDeviceRecord): """ Tests the DeviceRecord class with np.record host types """ + def setUp(self): CUDATestCase.setUp(self) self._create_data(np.recarray) -@skip_on_cudasim('Structured array attr access not supported in simulator') +@skip_on_cudasim("Structured array attr access not supported in simulator") class TestRecordDtypeWithStructArrays(CUDATestCase): - ''' + """ Test operation of device arrays on structured arrays. 
- ''' + """ def _createSampleArrays(self): self.sample1d = cuda.device_array(3, dtype=recordtype) self.samplerec1darr = cuda.device_array(1, dtype=recordwitharray)[0] - self.samplerecmat = cuda.device_array(1,dtype=recwithmat)[0] + self.samplerecmat = cuda.device_array(1, dtype=recwithmat)[0] def setUp(self): super().setUp() @@ -134,46 +132,46 @@ def setUp(self): ary = self.sample1d for i in range(ary.size): x = i + 1 - ary[i]['a'] = x / 2 - ary[i]['b'] = x - ary[i]['c'] = x * 1j - ary[i]['d'] = str(x) * N_CHARS + ary[i]["a"] = x / 2 + ary[i]["b"] = x + ary[i]["c"] = x * 1j + ary[i]["d"] = str(x) * N_CHARS def test_structured_array1(self): ary = self.sample1d for i in range(self.sample1d.size): x = i + 1 - self.assertEqual(ary[i]['a'], x / 2) - self.assertEqual(ary[i]['b'], x) - self.assertEqual(ary[i]['c'], x * 1j) - self.assertEqual(ary[i]['d'], str(x) * N_CHARS) + self.assertEqual(ary[i]["a"], x / 2) + self.assertEqual(ary[i]["b"], x) + self.assertEqual(ary[i]["c"], x * 1j) + self.assertEqual(ary[i]["d"], str(x) * N_CHARS) def test_structured_array2(self): ary = self.samplerec1darr - ary['g'] = 2 - ary['h'][0] = 3.0 - ary['h'][1] = 4.0 - self.assertEqual(ary['g'], 2) - self.assertEqual(ary['h'][0], 3.0) - self.assertEqual(ary['h'][1], 4.0) + ary["g"] = 2 + ary["h"][0] = 3.0 + ary["h"][1] = 4.0 + self.assertEqual(ary["g"], 2) + self.assertEqual(ary["h"][0], 3.0) + self.assertEqual(ary["h"][1], 4.0) def test_structured_array3(self): ary = self.samplerecmat - mat = np.array([[5.0, 10.0, 15.0], - [20.0, 25.0, 30.0], - [35.0, 40.0, 45.0]], - dtype=np.float32).reshape(3,3) - ary['j'][:] = mat - np.testing.assert_equal(ary['j'], mat) + mat = np.array( + [[5.0, 10.0, 15.0], [20.0, 25.0, 30.0], [35.0, 40.0, 45.0]], + dtype=np.float32, + ).reshape(3, 3) + ary["j"][:] = mat + np.testing.assert_equal(ary["j"], mat) def test_structured_array4(self): arr = np.zeros(1, dtype=recwithrecwithmat) d_arr = cuda.to_device(arr) - d_arr[0]['y']['i'] = 1 - 
self.assertEqual(d_arr[0]['y']['i'], 1) - d_arr[0]['y']['j'][0, 0] = 2.0 - self.assertEqual(d_arr[0]['y']['j'][0, 0], 2.0) + d_arr[0]["y"]["i"] = 1 + self.assertEqual(d_arr[0]["y"]["i"], 1) + d_arr[0]["y"]["j"][0, 0] = 2.0 + self.assertEqual(d_arr[0]["y"]["j"][0, 0], 2.0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index ea9d72fa8..6972a44ed 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -1,13 +1,17 @@ from ctypes import byref, c_int, c_void_p, sizeof -from numba.cuda.cudadrv.driver import (host_to_device, device_to_host, driver, - launch_kernel) +from numba.cuda.cudadrv.driver import ( + host_to_device, + device_to_host, + driver, + launch_kernel, +) from numba.cuda.cudadrv import devices, drvapi, driver as _driver from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim -ptx1 = ''' +ptx1 = """ .version 1.4 .target sm_10, map_f64_to_f32 @@ -29,9 +33,9 @@ exit; $LDWend__Z10helloworldPi: } // _Z10helloworldPi -''' +""" -ptx2 = ''' +ptx2 = """ .version 3.0 .target sm_20 .address_size 64 @@ -57,10 +61,10 @@ .loc 2 7 2 ret; } -''' +""" -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestCudaDriver(CUDATestCase): def setUp(self): super().setUp() @@ -79,7 +83,7 @@ def tearDown(self): def test_cuda_driver_basic(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") array = (c_int * 100)() @@ -93,12 +97,18 @@ def test_cuda_driver_basic(self): ptr = c_void_p(int(ptr)) stream = _driver.binding.CUstream(stream) - launch_kernel(function.handle, # Kernel - 1, 1, 1, # gx, gy, gz - 100, 1, 1, # bx, by, bz 
- 0, # dynamic shared mem - stream, # stream - [ptr]) # arguments + launch_kernel( + function.handle, # Kernel + 1, + 1, + 1, # gx, gy, gz + 100, + 1, + 1, # bx, by, bz + 0, # dynamic shared mem + stream, # stream + [ptr], + ) # arguments device_to_host(array, memory, sizeof(array)) for i, v in enumerate(array): @@ -108,7 +118,7 @@ def test_cuda_driver_basic(self): def test_cuda_driver_stream_operations(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") array = (c_int * 100)() @@ -122,12 +132,18 @@ def test_cuda_driver_stream_operations(self): if _driver.USE_NV_BINDING: ptr = c_void_p(int(ptr)) - launch_kernel(function.handle, # Kernel - 1, 1, 1, # gx, gy, gz - 100, 1, 1, # bx, by, bz - 0, # dynamic shared mem - stream.handle, # stream - [ptr]) # arguments + launch_kernel( + function.handle, # Kernel + 1, + 1, + 1, # gx, gy, gz + 100, + 1, + 1, # bx, by, bz + 0, # dynamic shared mem + stream.handle, # stream + [ptr], + ) # arguments device_to_host(array, memory, sizeof(array), stream=stream) @@ -193,17 +209,19 @@ def test_cuda_driver_external_stream(self): def test_cuda_driver_occupancy(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") - value = self.context.get_active_blocks_per_multiprocessor(function, - 128, 128) + value = self.context.get_active_blocks_per_multiprocessor( + function, 128, 128 + ) self.assertTrue(value > 0) def b2d(bs): return bs - grid, block = self.context.get_max_potential_block_size(function, b2d, - 128, 128) + grid, block = self.context.get_max_potential_block_size( + function, b2d, 128, 128 + ) self.assertTrue(grid > 0) self.assertTrue(block > 0) @@ -221,15 +239,15 @@ def test_device_get_uuid(self): # 4122) pertaining to versions and variants, so we do not extract and # validate the values of these bits. 
- h = '[0-9a-f]{%d}' + h = "[0-9a-f]{%d}" h4 = h % 4 h8 = h % 8 h12 = h % 12 - uuid_format = f'^GPU-{h8}-{h4}-{h4}-{h4}-{h12}$' + uuid_format = f"^GPU-{h8}-{h4}-{h4}-{h4}-{h12}$" dev = devices.get_context().device self.assertRegex(dev.uuid, uuid_format) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py index 890bf6829..f80c44ada 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py @@ -3,7 +3,7 @@ from numba.misc.findlib import find_lib -@skip_on_cudasim('Library detection unsupported in the simulator') +@skip_on_cudasim("Library detection unsupported in the simulator") @skip_unless_conda_cudatoolkit class TestLibraryDetection(unittest.TestCase): def test_detect(self): @@ -13,10 +13,10 @@ def test_detect(self): PyCulib (and potentially others) rely on Numba's library finding capacity to find and subsequently load these libraries. 
""" - core_libs = ['nvvm'] + core_libs = ["nvvm"] for l in core_libs: self.assertNotEqual(find_lib(l), []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py index 6402f7773..5d187411f 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py @@ -7,7 +7,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('CUDA Memory API unsupported in the simulator') +@skip_on_cudasim("CUDA Memory API unsupported in the simulator") class TestCudaMemory(ContextResettingTestCase): def setUp(self): super().setUp() @@ -24,8 +24,7 @@ def _template(self, obj): expected_class = driver.binding.CUdeviceptr else: expected_class = drvapi.cu_device_ptr - self.assertTrue(isinstance(obj.device_ctypes_pointer, - expected_class)) + self.assertTrue(isinstance(obj.device_ctypes_pointer, expected_class)) def test_device_memory(self): devmem = self.context.memalloc(1024) @@ -41,9 +40,9 @@ def test_host_alloc(self): def test_pinned_memory(self): ary = np.arange(10) - devmem = self.context.mempin(ary, ary.ctypes.data, - ary.size * ary.dtype.itemsize, - mapped=True) + devmem = self.context.mempin( + ary, ary.ctypes.data, ary.size * ary.dtype.itemsize, mapped=True + ) self._template(devmem) def test_managed_memory(self): @@ -69,8 +68,7 @@ def check(m, offset): v2 = v1.view(offset) self.assertEqual(handle_val(v2.owner), handle_val(m)) self.assertEqual(handle_val(v2.owner), handle_val(m)) - self.assertEqual(handle_val(v2) - offset * 2, - handle_val(v2.owner)) + self.assertEqual(handle_val(v2) - offset * 2, handle_val(v2.owner)) self.assertEqual(m.refct, 3) del v2 self.assertEqual(m.refct, 2) @@ -84,22 +82,24 @@ def check(m, offset): def test_user_extension(self): # User can use MemoryPointer to wrap externally defined pointers. 
# This test checks if the finalizer is invokded at correct time - fake_ptr = ctypes.c_void_p(0xdeadbeef) + fake_ptr = ctypes.c_void_p(0xDEADBEEF) dtor_invoked = [0] def dtor(): dtor_invoked[0] += 1 # Ensure finalizer is called when pointer is deleted - ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr, - size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) self.assertEqual(dtor_invoked[0], 0) del ptr self.assertEqual(dtor_invoked[0], 1) # Ensure removing derived pointer doesn't call finalizer - ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr, - size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) owned = ptr.own() del owned self.assertEqual(dtor_invoked[0], 1) @@ -128,16 +128,16 @@ def test_memcpy(self): self.assertTrue(np.all(hstary == hstary2)) def test_memset(self): - dtype = np.dtype('uint32') + dtype = np.dtype("uint32") n = 10 sz = dtype.itemsize * 10 devary = self.context.memalloc(sz) - driver.device_memset(devary, 0xab, sz) + driver.device_memset(devary, 0xAB, sz) hstary = np.empty(n, dtype=dtype) driver.device_to_host(hstary, devary, sz) - hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32')) + hstary2 = np.array([0xABABABAB] * n, dtype=np.dtype("uint32")) self.assertTrue(np.all(hstary == hstary2)) def test_d2d(self): @@ -152,7 +152,7 @@ def test_d2d(self): self.assertTrue(np.all(hst == hst2)) -@skip_on_cudasim('CUDA Memory API unsupported in the simulator') +@skip_on_cudasim("CUDA Memory API unsupported in the simulator") class TestMVExtent(ContextResettingTestCase): def test_c_contiguous_array(self): ary = np.arange(100) @@ -177,7 +177,7 @@ def test_single_element_array(self): def test_ctypes_struct(self): class mystruct(ctypes.Structure): - _fields_ = [('x', ctypes.c_int), ('y', ctypes.c_int)] + _fields_ = [("x", ctypes.c_int), ("y", ctypes.c_int)] data = 
mystruct(x=123, y=432) sz = driver.host_memory_size(data) @@ -189,5 +189,5 @@ def test_ctypes_double(self): self.assertTrue(ctypes.sizeof(data) == sz) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py index fd6150882..68ebd234a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py @@ -57,10 +57,7 @@ def test_devicearray(self): def test_stream_bind(self): stream = cuda.stream() with stream.auto_synchronize(): - arr = cuda.device_array( - (3, 3), - dtype=np.float64, - stream=stream) + arr = cuda.device_array((3, 3), dtype=np.float64, stream=stream) self.assertEqual(arr.bind(stream).stream, stream) self.assertEqual(arr.stream, stream) @@ -90,8 +87,8 @@ def test_devicearray_partition(self): self.assertTrue(np.all(array == 0)) - right.copy_to_host(array[N // 2:]) - left.copy_to_host(array[:N // 2]) + right.copy_to_host(array[N // 2 :]) + left.copy_to_host(array[: N // 2]) self.assertTrue(np.all(array == original)) @@ -104,7 +101,7 @@ def test_devicearray_replace(self): gpumem.copy_to_host(array) np.testing.assert_array_equal(array, original * 2) - @skip_on_cudasim('This works in the simulator') + @skip_on_cudasim("This works in the simulator") def test_devicearray_transpose_wrongdim(self): gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4, 1)) @@ -113,13 +110,15 @@ def test_devicearray_transpose_wrongdim(self): self.assertEqual( "transposing a non-2D DeviceNDArray isn't supported", - str(e.exception)) + str(e.exception), + ) def test_devicearray_transpose_identity(self): # any-shape identities should work original = np.array(np.arange(24)).reshape(3, 4, 2) - array = np.transpose(cuda.to_device(original), - axes=(0, 1, 2)).copy_to_host() + array = np.transpose( + cuda.to_device(original), axes=(0, 1, 2) + ).copy_to_host() 
self.assertTrue(np.all(array == original)) def test_devicearray_transpose_duplicatedaxis(self): @@ -131,9 +130,10 @@ def test_devicearray_transpose_duplicatedaxis(self): self.assertIn( str(e.exception), container=[ - 'invalid axes list (0, 0)', # GPU - 'repeated axis in transpose', # sim - ]) + "invalid axes list (0, 0)", # GPU + "repeated axis in transpose", # sim + ], + ) def test_devicearray_transpose_wrongaxis(self): gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4)) @@ -144,10 +144,11 @@ def test_devicearray_transpose_wrongaxis(self): self.assertIn( str(e.exception), container=[ - 'invalid axes list (0, 2)', # GPU - 'invalid axis for this array', - 'axis 2 is out of bounds for array of dimension 2', # sim - ]) + "invalid axes list (0, 2)", # GPU + "invalid axis for this array", + "axis 2 is out of bounds for array of dimension 2", # sim + ], + ) def test_devicearray_view_ok(self): original = np.array(np.arange(12), dtype="i2").reshape(3, 4) @@ -155,8 +156,7 @@ def test_devicearray_view_ok(self): for dtype in ("i4", "u4", "i8", "f8"): with self.subTest(dtype=dtype): np.testing.assert_array_equal( - array.view(dtype).copy_to_host(), - original.view(dtype) + array.view(dtype).copy_to_host(), original.view(dtype) ) def test_devicearray_view_ok_not_c_contig(self): @@ -164,8 +164,7 @@ def test_devicearray_view_ok_not_c_contig(self): array = cuda.to_device(original)[:, ::2] original = original[:, ::2] np.testing.assert_array_equal( - array.view("u2").copy_to_host(), - original.view("u2") + array.view("u2").copy_to_host(), original.view("u2") ) def test_devicearray_view_bad_not_c_contig(self): @@ -175,12 +174,14 @@ def test_devicearray_view_bad_not_c_contig(self): array.view("i4") msg = str(e.exception) - self.assertIn('To change to a dtype of a different size,', msg) + self.assertIn("To change to a dtype of a different size,", msg) - contiguous_pre_np123 = 'the array must be C-contiguous' in msg - contiguous_post_np123 = 'the last axis must be contiguous' 
in msg - self.assertTrue(contiguous_pre_np123 or contiguous_post_np123, - 'Expected message to mention contiguity') + contiguous_pre_np123 = "the array must be C-contiguous" in msg + contiguous_post_np123 = "the last axis must be contiguous" in msg + self.assertTrue( + contiguous_pre_np123 or contiguous_post_np123, + "Expected message to mention contiguity", + ) def test_devicearray_view_bad_itemsize(self): original = np.array(np.arange(12), dtype="i2").reshape(4, 3) @@ -191,7 +192,8 @@ def test_devicearray_view_bad_itemsize(self): "When changing to a larger dtype," " its size must be a divisor of the total size in bytes" " of the last axis of the array.", - str(e.exception)) + str(e.exception), + ) def test_devicearray_transpose_ok(self): original = np.array(np.arange(12)).reshape(3, 4) @@ -206,7 +208,7 @@ def test_devicearray_transpose_T(self): def test_devicearray_contiguous_slice(self): # memcpys are dumb ranges of bytes, so trying to # copy to a non-contiguous range shouldn't work! - a = np.arange(25).reshape(5, 5, order='F') + a = np.arange(25).reshape(5, 5, order="F") s = np.full(fill_value=5, shape=(5,)) d = cuda.to_device(a) @@ -216,9 +218,7 @@ def test_devicearray_contiguous_slice(self): # (40-byte strides). This means we can't memcpy to it! 
with self.assertRaises(ValueError) as e: d[2].copy_to_device(s) - self.assertEqual( - devicearray.errmsg_contiguous_buffer, - str(e.exception)) + self.assertEqual(devicearray.errmsg_contiguous_buffer, str(e.exception)) # if d[2].copy_to_device(s), then this would pass: # self.assertTrue((a == d.copy_to_host()).all()) @@ -236,9 +236,9 @@ def _test_devicearray_contiguous_host_copy(self, a_c, a_f): (a_c, a_f), (a_c, a_c), ]: - msg = '%s => %s' % ( - 'C' if original.flags.c_contiguous else 'F', - 'C' if copy.flags.c_contiguous else 'F', + msg = "%s => %s" % ( + "C" if original.flags.c_contiguous else "F", + "C" if copy.flags.c_contiguous else "F", ) d = cuda.to_device(original) @@ -248,17 +248,17 @@ def _test_devicearray_contiguous_host_copy(self, a_c, a_f): def test_devicearray_contiguous_copy_host_3d(self): a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self._test_devicearray_contiguous_host_copy(a_c, a_f) def test_devicearray_contiguous_copy_host_1d(self): a_c = np.arange(5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self._test_devicearray_contiguous_host_copy(a_c, a_f) def test_devicearray_contiguous_copy_device(self): a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self.assertTrue(a_c.flags.c_contiguous) self.assertTrue(a_f.flags.f_contiguous) @@ -268,7 +268,8 @@ def test_devicearray_contiguous_copy_device(self): d.copy_to_device(cuda.to_device(a_f)) self.assertEqual( "incompatible strides: {} vs. {}".format(a_c.strides, a_f.strides), - str(e.exception)) + str(e.exception), + ) d.copy_to_device(cuda.to_device(a_c)) self.assertTrue(np.all(d.copy_to_host() == a_c)) @@ -279,7 +280,8 @@ def test_devicearray_contiguous_copy_device(self): d.copy_to_device(cuda.to_device(a_c)) self.assertEqual( "incompatible strides: {} vs. 
{}".format(a_f.strides, a_c.strides), - str(e.exception)) + str(e.exception), + ) d.copy_to_device(cuda.to_device(a_f)) self.assertTrue(np.all(d.copy_to_host() == a_f)) @@ -288,8 +290,8 @@ def test_devicearray_broadcast_host_copy(self): broadsize = 4 coreshape = (2, 3) coresize = np.prod(coreshape) - core_c = np.arange(coresize).reshape(coreshape, order='C') - core_f = np.arange(coresize).reshape(coreshape, order='F') + core_c = np.arange(coresize).reshape(coreshape, order="C") + core_f = np.arange(coresize).reshape(coreshape, order="F") for dim in range(len(coreshape)): newindex = (slice(None),) * dim + (np.newaxis,) broadshape = coreshape[:dim] + (broadsize,) + coreshape[dim:] @@ -318,11 +320,9 @@ def test_devicearray_contiguous_device_strided(self): with self.assertRaises(ValueError) as e: d.copy_to_device(cuda.to_device(arr)[::2]) - self.assertEqual( - devicearray.errmsg_contiguous_buffer, - str(e.exception)) + self.assertEqual(devicearray.errmsg_contiguous_buffer, str(e.exception)) - @skip_on_cudasim('DeviceNDArray class not present in simulator') + @skip_on_cudasim("DeviceNDArray class not present in simulator") def test_devicearray_relaxed_strides(self): # From the reproducer in Issue #6824. @@ -334,86 +334,88 @@ def test_devicearray_relaxed_strides(self): # Ensure we still believe the array to be contiguous because # strides checking is relaxed. - self.assertTrue(arr.flags['C_CONTIGUOUS']) - self.assertTrue(arr.flags['F_CONTIGUOUS']) + self.assertTrue(arr.flags["C_CONTIGUOUS"]) + self.assertTrue(arr.flags["F_CONTIGUOUS"]) def test_c_f_contiguity_matches_numpy(self): # From the reproducer in Issue #4943. 
shapes = ((1, 4), (4, 1)) - orders = ('C', 'F') + orders = ("C", "F") for shape, order in itertools.product(shapes, orders): arr = np.ndarray(shape, order=order) d_arr = cuda.to_device(arr) - self.assertEqual(arr.flags['C_CONTIGUOUS'], - d_arr.flags['C_CONTIGUOUS']) - self.assertEqual(arr.flags['F_CONTIGUOUS'], - d_arr.flags['F_CONTIGUOUS']) + self.assertEqual( + arr.flags["C_CONTIGUOUS"], d_arr.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + arr.flags["F_CONTIGUOUS"], d_arr.flags["F_CONTIGUOUS"] + ) - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_simple_c(self): # C-order 1D array - a = np.zeros(10, order='C') + a = np.zeros(10, order="C") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_simple_f(self): # F-order array that is also C layout. 
- a = np.zeros(10, order='F') + a = np.zeros(10, order="F") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_2d_c(self): # C-order 2D array - a = np.zeros((2, 10), order='C') + a = np.zeros((2, 10), order="C") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_2d_f(self): # F-order array that can only be F layout - a = np.zeros((2, 10), order='F') + a = np.zeros((2, 10), order="F") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'F') + self.assertEqual(d._numba_type_.layout, "F") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_noncontig_slice_c(self): # Non-contiguous slice of C-order array - a = np.zeros((5, 5), order='C') - d = cuda.to_device(a)[:,2] - self.assertEqual(d._numba_type_.layout, 'A') + a = np.zeros((5, 5), order="C") + d = cuda.to_device(a)[:, 2] + self.assertEqual(d._numba_type_.layout, "A") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_noncontig_slice_f(self): # Non-contiguous slice of F-order array - a = np.zeros((5, 5), order='F') - d = cuda.to_device(a)[2,:] - self.assertEqual(d._numba_type_.layout, 'A') + a = np.zeros((5, 5), order="F") + d = cuda.to_device(a)[2, :] + self.assertEqual(d._numba_type_.layout, "A") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_contig_slice_c(self): # Contiguous slice of C-order array - a = 
np.zeros((5, 5), order='C') - d = cuda.to_device(a)[2,:] - self.assertEqual(d._numba_type_.layout, 'C') + a = np.zeros((5, 5), order="C") + d = cuda.to_device(a)[2, :] + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_contig_slice_f(self): # Contiguous slice of F-order array - is both C- and F-contiguous, so # types as 'C' layout - a = np.zeros((5, 5), order='F') - d = cuda.to_device(a)[:,2] - self.assertEqual(d._numba_type_.layout, 'C') + a = np.zeros((5, 5), order="F") + d = cuda.to_device(a)[:, 2] + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_broadcasted(self): # Broadcasted array, similar to that used for passing scalars to ufuncs a = np.broadcast_to(np.array([1]), (10,)) d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'A') + self.assertEqual(d._numba_type_.layout, "A") def test_bug6697(self): ary = np.arange(10, dtype=np.int16) @@ -421,7 +423,7 @@ def test_bug6697(self): got = np.asarray(dary) self.assertEqual(got.dtype, dary.dtype) - @skip_on_cudasim('DeviceNDArray class not present in simulator') + @skip_on_cudasim("DeviceNDArray class not present in simulator") def test_issue_8477(self): # Ensure that we can copy a zero-length device array to a zero-length # host array when the strides of the device and host arrays differ - @@ -430,8 +432,9 @@ def test_issue_8477(self): # https://github.com/numba/numba/issues/8477. 
# Create a device array with shape (0,) and strides (8,) - dev_array = devicearray.DeviceNDArray(shape=(0,), strides=(8,), - dtype=np.int8) + dev_array = devicearray.DeviceNDArray( + shape=(0,), strides=(8,), dtype=np.int8 + ) # Create a host array with shape (0,) and strides (0,) host_array = np.ndarray(shape=(0,), strides=(0,), dtype=np.int8) @@ -470,8 +473,7 @@ def test_np_array_dtype(self): dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0])) host_array = np.array(dev_array, dtype=dtype) np.testing.assert_equal( - host_array, - dev_array.copy_to_host().astype(dtype) + host_array, dev_array.copy_to_host().astype(dtype) ) @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg") @@ -490,10 +492,13 @@ def test_np_array_copy_true(self): class TestRecarray(CUDATestCase): def test_recarray(self): # From issue #4111 - a = np.recarray((16,), dtype=[ - ("value1", np.int64), - ("value2", np.float64), - ]) + a = np.recarray( + (16,), + dtype=[ + ("value1", np.int64), + ("value2", np.float64), + ], + ) a.value1 = np.arange(a.size, dtype=np.int64) a.value2 = np.arange(a.size, dtype=np.float64) / 100 @@ -518,39 +523,39 @@ class TestCoreContiguous(CUDATestCase): def _test_against_array_core(self, view): self.assertEqual( devicearray.is_contiguous(view), - devicearray.array_core(view).flags['C_CONTIGUOUS'] + devicearray.array_core(view).flags["C_CONTIGUOUS"], ) def test_device_array_like_1d(self): - d_a = cuda.device_array(10, order='C') + d_a = cuda.device_array(10, order="C") self._test_against_array_core(d_a) def test_device_array_like_2d(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") self._test_against_array_core(d_a) def test_device_array_like_2d_transpose(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") self._test_against_array_core(d_a.T) def test_device_array_like_3d(self): - d_a = cuda.device_array((10, 12, 14), order='C') + d_a = 
cuda.device_array((10, 12, 14), order="C") self._test_against_array_core(d_a) def test_device_array_like_1d_f(self): - d_a = cuda.device_array(10, order='F') + d_a = cuda.device_array(10, order="F") self._test_against_array_core(d_a) def test_device_array_like_2d_f(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") self._test_against_array_core(d_a) def test_device_array_like_2d_f_transpose(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") self._test_against_array_core(d_a.T) def test_device_array_like_3d_f(self): - d_a = cuda.device_array((10, 12, 14), order='F') + d_a = cuda.device_array((10, 12, 14), order="F") self._test_against_array_core(d_a) def test_1d_view(self): @@ -560,7 +565,7 @@ def test_1d_view(self): def test_1d_view_f(self): shape = 10 - view = np.zeros(shape, order='F')[::2] + view = np.zeros(shape, order="F")[::2] self._test_against_array_core(view) def test_2d_view(self): @@ -570,9 +575,9 @@ def test_2d_view(self): def test_2d_view_f(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2] + view = np.zeros(shape, order="F")[::2, ::2] self._test_against_array_core(view) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py b/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py index 66fbbc372..7f03912c1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py @@ -3,14 +3,18 @@ import numpy as np from numba import cuda -from numba.cuda.testing import (unittest, skip_on_cudasim, - skip_if_external_memmgr, CUDATestCase) +from numba.cuda.testing import ( + unittest, + skip_on_cudasim, + skip_if_external_memmgr, + CUDATestCase, +) from numba.tests.support import captured_stderr from numba.core import config -@skip_on_cudasim('not supported on CUDASIM') 
-@skip_if_external_memmgr('Deallocation specific to Numba memory management') +@skip_on_cudasim("not supported on CUDASIM") +@skip_if_external_memmgr("Deallocation specific to Numba memory management") class TestDeallocation(CUDATestCase): def test_max_pending_count(self): # get deallocation manager and flush it @@ -41,8 +45,9 @@ def test_max_pending_bytes(self): config.CUDA_DEALLOCS_RATIO = max_pending / mi.total # due to round off error (floor is used in calculating # _max_pending_bytes) it can be off by 1. - self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending, - delta=1) + self.assertAlmostEqual( + deallocs._max_pending_bytes, max_pending, delta=1 + ) # allocate half the max size # this will not trigger deallocation @@ -51,8 +56,11 @@ def test_max_pending_bytes(self): # allocate another remaining # this will not trigger deallocation - cuda.to_device(np.ones(deallocs._max_pending_bytes - - deallocs._size, dtype=np.int8)) + cuda.to_device( + np.ones( + deallocs._max_pending_bytes - deallocs._size, dtype=np.int8 + ) + ) self.assertEqual(len(deallocs), 2) # another byte to trigger .clear() @@ -64,7 +72,7 @@ def test_max_pending_bytes(self): @skip_on_cudasim("defer_cleanup has no effect in CUDASIM") -@skip_if_external_memmgr('Deallocation specific to Numba memory management') +@skip_if_external_memmgr("Deallocation specific to Numba memory management") class TestDeferCleanup(CUDATestCase): def test_basic(self): harr = np.arange(5) @@ -138,11 +146,12 @@ def test_context_manager(self): pass -@skip_on_cudasim('not supported on CUDASIM') +@skip_on_cudasim("not supported on CUDASIM") class TestDel(CUDATestCase): """ Ensure resources are deleted properly without ignored exception. 
""" + @contextmanager def check_ignored_exception(self, ctx): with captured_stderr() as cap: @@ -245,5 +254,5 @@ class MappedException(Exception): pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py b/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py index 528e11bf8..d70b6776e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py @@ -3,8 +3,12 @@ import subprocess import threading from numba import cuda -from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim, - skip_under_cuda_memcheck) +from numba.cuda.testing import ( + unittest, + CUDATestCase, + skip_on_cudasim, + skip_under_cuda_memcheck, +) from numba.tests.support import captured_stdout @@ -14,21 +18,19 @@ def test_cuda_detect(self): with captured_stdout() as out: cuda.detect() output = out.getvalue() - self.assertIn('Found', output) - self.assertIn('CUDA devices', output) + self.assertIn("Found", output) + self.assertIn("CUDA devices", output) -@skip_under_cuda_memcheck('Hangs cuda-memcheck') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") class TestCUDAFindLibs(CUDATestCase): - def run_cmd(self, cmdline, env): - popen = subprocess.Popen(cmdline, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env) + popen = subprocess.Popen( + cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env + ) # finish in 5 minutes or kill it - timeout = threading.Timer(5 * 60., popen.kill) + timeout = threading.Timer(5 * 60.0, popen.kill) try: timeout.start() out, err = popen.communicate() @@ -51,8 +53,8 @@ def kernel(x): cmdline = [sys.executable, "-c", code] return self.run_cmd(cmdline, env_copy) - @skip_on_cudasim('Simulator does not hit device library search code path') - @unittest.skipIf(not sys.platform.startswith('linux'), "linux only") + @skip_on_cudasim("Simulator does not hit device library search code path") + 
@unittest.skipIf(not sys.platform.startswith("linux"), "linux only") def test_cuda_find_lib_errors(self): """ This tests that the find_libs works as expected in the case of an @@ -60,7 +62,7 @@ def test_cuda_find_lib_errors(self): """ # one of these is likely to exist on linux, it's also unlikely that # someone has extracted the contents of libdevice into here! - locs = ['lib', 'lib64'] + locs = ["lib", "lib64"] looking_for = None for l in locs: @@ -71,11 +73,12 @@ def test_cuda_find_lib_errors(self): # This is the testing part, the test will only run if there's a valid # path in which to look if looking_for is not None: - out, err = self.run_test_in_separate_process("NUMBA_CUDA_DRIVER", - looking_for) + out, err = self.run_test_in_separate_process( + "NUMBA_CUDA_DRIVER", looking_for + ) self.assertTrue(out is not None) self.assertTrue(err is not None) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py index 209355ed6..c0ad870bd 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py @@ -8,6 +8,7 @@ from numba.tests.support import linux_only if not config.ENABLE_CUDASIM: + class DeviceOnlyEMMPlugin(cuda.HostOnlyCUDAMemoryManager): """ Dummy EMM Plugin implementation for testing. It memorises which plugin @@ -56,8 +57,9 @@ def finalizer(): # the reference count drops to zero. ctx = weakref.proxy(self.context) ptr = ctypes.c_void_p(alloc_count) - return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size, - finalizer=finalizer) + return cuda.cudadrv.driver.AutoFreePointer( + ctx, ptr, size, finalizer=finalizer + ) def initialize(self): # No special initialization needed. 
@@ -97,7 +99,7 @@ def interface_version(self): return 2 -@skip_on_cudasim('EMM Plugins not supported on CUDA simulator') +@skip_on_cudasim("EMM Plugins not supported on CUDA simulator") class TestDeviceOnlyEMMPlugin(CUDATestCase): """ Tests that the API of an EMM Plugin that implements device allocations @@ -175,7 +177,7 @@ def test_get_ipc_handle(self): self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle) -@skip_on_cudasim('EMM Plugins not supported on CUDA simulator') +@skip_on_cudasim("EMM Plugins not supported on CUDA simulator") class TestBadEMMPluginVersion(CUDATestCase): """ Ensure that Numba rejects EMM Plugins with incompatible version @@ -185,8 +187,8 @@ class TestBadEMMPluginVersion(CUDATestCase): def test_bad_plugin_version(self): with self.assertRaises(RuntimeError) as raises: cuda.set_memory_manager(BadVersionEMMPlugin) - self.assertIn('version 1 required', str(raises.exception)) + self.assertIn("version 1 required", str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py index b611a4a75..f8a7805d5 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py @@ -34,5 +34,5 @@ def test_event_elapsed_stream(self): evtstart.elapsed_time(evtend) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py index 62c4ecafe..02761d958 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py @@ -10,10 +10,9 @@ def test_host_alloc_driver(self): mem = cuda.current_context().memhostalloc(n, mapped=True) dtype = np.dtype(np.uint8) - ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, - buffer=mem) + ary = np.ndarray(shape=n // 
dtype.itemsize, dtype=dtype, buffer=mem) - magic = 0xab + magic = 0xAB driver.device_memset(mem, magic, n) self.assertTrue(np.all(ary == magic)) @@ -46,8 +45,10 @@ def test_host_alloc_mapped(self): self.assertTrue(sum(ary != 0) == 0) def test_host_operators(self): - for ary in [cuda.mapped_array(10, dtype=np.uint32), - cuda.pinned_array(10, dtype=np.uint32)]: + for ary in [ + cuda.mapped_array(10, dtype=np.uint32), + cuda.pinned_array(10, dtype=np.uint32), + ]: ary[:] = range(10) self.assertTrue(sum(ary + 1) == 55) self.assertTrue(sum((ary + 1) * 2 - 1) == 100) @@ -55,11 +56,11 @@ def test_host_operators(self): self.assertTrue(sum(ary <= 5) == 6) self.assertTrue(sum(ary > 6) == 3) self.assertTrue(sum(ary >= 6) == 4) - self.assertTrue(sum(ary ** 2) == 285) + self.assertTrue(sum(ary**2) == 285) self.assertTrue(sum(ary // 2) == 20) self.assertTrue(sum(ary / 2.0) == 22.5) self.assertTrue(sum(ary % 2) == 5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_init.py b/numba_cuda/numba/cuda/tests/cudadrv/test_init.py index 600687fd5..c5dccbd6a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_init.py @@ -9,7 +9,7 @@ # A mock of cuInit that always raises a CudaAPIError def cuInit_raising(arg): - raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN') + raise CudaAPIError(999, "CUDA_ERROR_UNKNOWN") # Test code to run in a child that patches driver.cuInit to a variant that @@ -82,45 +82,45 @@ def cuda_disabled_error_test(result_queue): result_queue.put((success, msg)) -@skip_on_cudasim('CUDA Simulator does not initialize driver') +@skip_on_cudasim("CUDA Simulator does not initialize driver") class TestInit(CUDATestCase): def _test_init_failure(self, target, expected): # Run the initialization failure test in a separate subprocess - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=target, 
args=(result_queue,)) proc.start() - proc.join(30) # should complete within 30s + proc.join(30) # should complete within 30s success, msg = result_queue.get() # Ensure the child process raised an exception during initialization # before checking the message if not success: - self.fail('CudaSupportError not raised') + self.fail("CudaSupportError not raised") self.assertIn(expected, msg) def test_init_failure_raising(self): - expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)' + expected = "Error at driver init: CUDA_ERROR_UNKNOWN (999)" self._test_init_failure(cuInit_raising_test, expected) def test_init_failure_error(self): - expected = 'CUDA_ERROR_UNKNOWN (999)' + expected = "CUDA_ERROR_UNKNOWN (999)" self._test_init_failure(initialization_error_test, expected) def _test_cuda_disabled(self, target): # Uses _test_init_failure to launch the test in a separate subprocess # with CUDA disabled. - cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA') - os.environ['NUMBA_DISABLE_CUDA'] = "1" + cuda_disabled = os.environ.get("NUMBA_DISABLE_CUDA") + os.environ["NUMBA_DISABLE_CUDA"] = "1" try: - expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1' + expected = "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1" self._test_init_failure(cuda_disabled_test, expected) finally: if cuda_disabled is not None: - os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled + os.environ["NUMBA_DISABLE_CUDA"] = cuda_disabled else: - os.environ.pop('NUMBA_DISABLE_CUDA') + os.environ.pop("NUMBA_DISABLE_CUDA") def test_cuda_disabled_raising(self): self._test_cuda_disabled(cuda_disabled_test) @@ -135,5 +135,5 @@ def test_init_success(self): self.assertIsNone(cuda.cuda_error()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py b/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py index 40a6fa599..aeeb5bbd2 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +++ 
b/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py @@ -5,20 +5,23 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('Inline PTX cannot be used in the simulator') +@skip_on_cudasim("Inline PTX cannot be used in the simulator") class TestCudaInlineAsm(ContextResettingTestCase): def test_inline_rsqrt(self): mod = ir.Module(__name__) - mod.triple = 'nvptx64-nvidia-cuda' + mod.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(mod) fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())]) - fn = ir.Function(mod, fnty, 'cu_rsqrt') - bldr = ir.IRBuilder(fn.append_basic_block('entry')) + fn = ir.Function(mod, fnty, "cu_rsqrt") + bldr = ir.IRBuilder(fn.append_basic_block("entry")) rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()]) - inlineasm = ir.InlineAsm(rsqrt_approx_fnty, - 'rsqrt.approx.f32 $0, $1;', - '=f,f', side_effect=True) + inlineasm = ir.InlineAsm( + rsqrt_approx_fnty, + "rsqrt.approx.f32 $0, $1;", + "=f,f", + side_effect=True, + ) val = bldr.load(fn.args[0]) res = bldr.call(inlineasm, [val]) @@ -30,8 +33,8 @@ def test_inline_rsqrt(self): nvvm.set_cuda_kernel(fn) nvvmir = str(mod) ptx = nvvm.compile_ir(nvvmir) - self.assertTrue('rsqrt.approx.f32' in str(ptx)) + self.assertTrue("rsqrt.approx.f32" in str(ptx)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 22e2ee837..be018ccef 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,10 +1,9 @@ import numpy as np import warnings from numba.cuda.testing import unittest -from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) +from numba.cuda.testing import skip_on_cudasim, skip_if_cuda_includes_missing from numba.cuda.testing import CUDATestCase, test_data_dir -from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, - 
LinkerError) +from numba.cuda.cudadrv.driver import CudaAPIError, Linker, LinkerError from numba.cuda.cudadrv.error import NvrtcError from numba.cuda import require_context from numba.tests.support import ignore_internal_warnings @@ -103,25 +102,24 @@ def simple_lmem(A, B, dty): B[i] = C[i] -@skip_on_cudasim('Linking unsupported in the simulator') +@skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): - _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'} + _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @require_context def test_linker_basic(self): - '''Simply go through the constructor and destructor - ''' + """Simply go through the constructor and destructor""" linker = Linker.new(cc=(5, 3)) del linker def _test_linking(self, eager): global bar # must be a global; other it is recognized as a freevar - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'jitlink.ptx') + link = str(test_data_dir / "jitlink.ptx") if eager: - args = ['void(int32[:], int32[:])'] + args = ["void(int32[:], int32[:])"] else: args = [] @@ -144,9 +142,9 @@ def test_linking_eager_compile(self): self._test_linking(eager=True) def test_linking_cu(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'jitlink.cu') + link = str(test_data_dir / "jitlink.cu") @cuda.jit(link=[link]) def kernel(r, x): @@ -165,36 +163,37 @@ def kernel(r, x): np.testing.assert_array_equal(r, expected) def test_linking_cu_log_warning(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'warn.cu') + link = str(test_data_dir / "warn.cu") with warnings.catch_warnings(record=True) as w: ignore_internal_warnings() - @cuda.jit('void(int32)', link=[link]) + @cuda.jit("void(int32)", link=[link]) def kernel(x): 
bar(x) - self.assertEqual(len(w), 1, 'Expected warnings from NVRTC') + self.assertEqual(len(w), 1, "Expected warnings from NVRTC") # Check the warning refers to the log messages - self.assertIn('NVRTC log messages', str(w[0].message)) + self.assertIn("NVRTC log messages", str(w[0].message)) # Check the message pertaining to the unused variable is provided - self.assertIn('declared but never referenced', str(w[0].message)) + self.assertIn("declared but never referenced", str(w[0].message)) def test_linking_cu_error(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'error.cu') + link = str(test_data_dir / "error.cu") with self.assertRaises(NvrtcError) as e: - @cuda.jit('void(int32)', link=[link]) + + @cuda.jit("void(int32)", link=[link]) def kernel(x): bar(x) msg = e.exception.args[0] # Check the error message refers to the NVRTC compile - self.assertIn('NVRTC Compilation failure', msg) + self.assertIn("NVRTC Compilation failure", msg) # Check the expected error in the CUDA source is reported self.assertIn('identifier "SYNTAX" is undefined', msg) # Check the filename is reported correctly @@ -203,33 +202,37 @@ def kernel(x): def test_linking_unknown_filetype_error(self): expected_err = "Don't know how to link file with extension .cuh" with self.assertRaisesRegex(RuntimeError, expected_err): - @cuda.jit('void()', link=['header.cuh']) + + @cuda.jit("void()", link=["header.cuh"]) def kernel(): pass def test_linking_file_with_no_extension_error(self): expected_err = "Don't know how to link file with no extension" with self.assertRaisesRegex(RuntimeError, expected_err): - @cuda.jit('void()', link=['data']) + + @cuda.jit("void()", link=["data"]) def kernel(): pass @skip_if_cuda_includes_missing def test_linking_cu_cuda_include(self): - link = str(test_data_dir / 'cuda_include.cu') + link = str(test_data_dir / "cuda_include.cu") # An exception will be raised when linking this kernel due 
to the # compile failure if CUDA includes cannot be found by Nvrtc. - @cuda.jit('void()', link=[link]) + @cuda.jit("void()", link=[link]) def kernel(): pass def test_try_to_link_nonexistent(self): with self.assertRaises(LinkerError) as e: - @cuda.jit('void(int32[::1])', link=['nonexistent.a']) + + @cuda.jit("void(int32[::1])", link=["nonexistent.a"]) def f(x): x[0] = 0 - self.assertIn('nonexistent.a not found', e.exception.args) + + self.assertIn("nonexistent.a not found", e.exception.args) def test_set_registers_no_max(self): """Ensure that the jitted kernel used in the test_set_registers_* tests @@ -276,7 +279,8 @@ def test_get_shared_mem_per_block(self): def test_get_shared_mem_per_specialized(self): compiled = cuda.jit(simple_smem) compiled_specialized = compiled.specialize( - np.zeros(100, dtype=np.int32), np.float64) + np.zeros(100, dtype=np.int32), np.float64 + ) shared_mem_size = compiled_specialized.get_shared_mem_per_block() self.assertEqual(shared_mem_size, 800) @@ -307,11 +311,12 @@ def test_get_local_mem_per_specialized(self): compiled_specialized = compiled.specialize( np.zeros(LMEM_SIZE, dtype=np.int32), np.zeros(LMEM_SIZE, dtype=np.int32), - np.float64) + np.float64, + ) local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py index e9cc37ca8..1f4eb411e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py @@ -7,11 +7,10 @@ from numba.tests.support import linux_only -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") @linux_only -@skip_on_arm('Managed Alloc support is 
experimental/untested on ARM') +@skip_on_arm("Managed Alloc support is experimental/untested on ARM") class TestManagedAlloc(ContextResettingTestCase): - def get_total_gpu_memory(self): # We use a driver function to directly get the total GPU memory because # an EMM plugin may report something different (or not implement @@ -85,7 +84,7 @@ def _test_managed_alloc_driver(self, memory_factor, attach_global=True): n_elems = n_bytes // dtype.itemsize ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem) - magic = 0xab + magic = 0xAB device_memset(mem, magic, n_bytes) ctx.synchronize() @@ -102,7 +101,7 @@ def _test_managed_array(self, attach_global=True): ary.fill(123.456) self.assertTrue(all(ary == 123.456)) - @cuda.jit('void(double[:])') + @cuda.jit("void(double[:])") def kernel(x): i = cuda.grid(1) if i < x.shape[0]: @@ -123,5 +122,5 @@ def test_managed_array_attach_host(self): self._test_managed_array(attach_global=False) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py index c25bc5ae2..4da56e009 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py @@ -1,8 +1,11 @@ import multiprocessing as mp import traceback from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck, - skip_if_mvc_libraries_unavailable) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_under_cuda_memcheck, + skip_if_mvc_libraries_unavailable, +) from numba.tests.support import linux_only @@ -24,7 +27,7 @@ def child_test_wrapper(result_queue): output = child_test() success = True # Catch anything raised so it can be propagated - except: # noqa: E722 + except: # noqa: E722 output = traceback.format_exc() success = False @@ -32,13 +35,13 @@ def child_test_wrapper(result_queue): @linux_only -@skip_under_cuda_memcheck('May hang 
CUDA memcheck') -@skip_on_cudasim('Simulator does not require or implement MVC') +@skip_under_cuda_memcheck("May hang CUDA memcheck") +@skip_on_cudasim("Simulator does not require or implement MVC") @skip_if_mvc_libraries_unavailable class TestMinorVersionCompatibility(CUDATestCase): def test_mvc(self): # Run test with Minor Version Compatibility enabled in a child process - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=child_test_wrapper, args=(result_queue,)) proc.start() @@ -50,5 +53,5 @@ def test_mvc(self): self.fail(output) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 106ab0d30..0fe6177cb 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -10,6 +10,7 @@ try: import pynvjitlink # noqa: F401 + PYNVJITLINK_INSTALLED = True except ImportError: PYNVJITLINK_INSTALLED = False @@ -52,7 +53,7 @@ @unittest.skipIf( not config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR, - "pynvjitlink not enabled" + "pynvjitlink not enabled", ) @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): @@ -85,7 +86,6 @@ def test_nvjitlink_invalid_cc_type_error(self): PyNvJitLinker(cc=0) def test_nvjitlink_ptx_compile_options(self): - max_registers = (None, 32) lineinfo = (False, True) lto = (False, True) @@ -190,7 +190,7 @@ def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self): files = [ test_device_functions_cu, test_device_functions_ltoir, - test_device_functions_fatbin_multi + test_device_functions_fatbin_multi, ] config.DUMP_ASSEMBLY = True @@ -228,7 +228,7 @@ def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self): for file in files: with self.subTest(file=file): with warnings.catch_warnings(record=True) as w: - with 
contextlib.redirect_stdout(None): # suppress other PTX + with contextlib.redirect_stdout(None): # suppress other PTX sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device( "add_from_numba", sig @@ -243,8 +243,11 @@ def kernel(result): assert result[0] == 3 assert len(w) == 1 - self.assertIn("it is not optimizable at link time, and " - "`ignore_nonlto == True`", str(w[0].message)) + self.assertIn( + "it is not optimizable at link time, and " + "`ignore_nonlto == True`", + str(w[0].message), + ) config.DUMP_ASSEMBLY = False @@ -262,7 +265,7 @@ def kernel(): @unittest.skipIf( not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR, - reason="pynvjitlink not enabled" + reason="pynvjitlink not enabled", ) class TestLinkerUsage(CUDATestCase): """Test that whether pynvjitlink can be enabled by both environment variable @@ -295,12 +298,12 @@ def kernel(result): def test_linker_enabled_envvar(self): env = os.environ.copy() - env['NUMBA_CUDA_ENABLE_PYNVJITLINK'] = "1" + env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "1" run_in_subprocess(self.src.format(config=""), env=env) def test_linker_disabled_envvar(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) with self.assertRaisesRegex( AssertionError, "LTO and additional flags require PyNvJitLinker" ): @@ -310,19 +313,25 @@ def test_linker_disabled_envvar(self): def test_linker_enabled_config(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) - run_in_subprocess(self.src.format( - config="config.CUDA_ENABLE_PYNVJITLINK = True"), env=env) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) + run_in_subprocess( + self.src.format(config="config.CUDA_ENABLE_PYNVJITLINK = True"), + env=env, + ) def test_linker_disabled_config(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) with override_config("CUDA_ENABLE_PYNVJITLINK", False): with 
self.assertRaisesRegex( AssertionError, "LTO and additional flags require PyNvJitLinker" ): - run_in_subprocess(self.src.format( - config="config.CUDA_ENABLE_PYNVJITLINK = False"), env=env) + run_in_subprocess( + self.src.format( + config="config.CUDA_ENABLE_PYNVJITLINK = False" + ), + env=env, + ) if __name__ == "__main__": diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py index 309169bfc..fad357243 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py @@ -7,7 +7,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestNvvmDriver(unittest.TestCase): def get_nvvmir(self): versions = NVVM().get_ir_version() @@ -16,9 +16,9 @@ def get_nvvmir(self): def test_nvvm_compile_simple(self): nvvmir = self.get_nvvmir() - ptx = nvvm.compile_ir(nvvmir).decode('utf8') - self.assertTrue('simple' in ptx) - self.assertTrue('ave' in ptx) + ptx = nvvm.compile_ir(nvvmir).decode("utf8") + self.assertTrue("simple" in ptx) + self.assertTrue("ave" in ptx) def test_nvvm_compile_nullary_option(self): # Tests compilation with an option that doesn't take an argument @@ -34,7 +34,7 @@ def test_nvvm_compile_nullary_option(self): # Verify we correctly passed the option by checking if we got LTOIR # from NVVM (by looking for the expected magic number for LTOIR) - self.assertEqual(ltoir[:4], b'\xed\x43\x4e\x7f') + self.assertEqual(ltoir[:4], b"\xed\x43\x4e\x7f") def test_nvvm_bad_option(self): # Ensure that unsupported / non-existent options are reported as such @@ -45,36 +45,37 @@ def test_nvvm_bad_option(self): def test_nvvm_from_llvm(self): m = ir.Module("test_nvvm_from_llvm") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(m) fty = ir.FunctionType(ir.VoidType(), 
[ir.IntType(32)]) - kernel = ir.Function(m, fty, name='mycudakernel') - bldr = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="mycudakernel") + bldr = ir.IRBuilder(kernel.append_basic_block("entry")) bldr.ret_void() nvvm.set_cuda_kernel(kernel) m.data_layout = NVVM().data_layout - ptx = nvvm.compile_ir(str(m)).decode('utf8') - self.assertTrue('mycudakernel' in ptx) - self.assertTrue('.address_size 64' in ptx) + ptx = nvvm.compile_ir(str(m)).decode("utf8") + self.assertTrue("mycudakernel" in ptx) + self.assertTrue(".address_size 64" in ptx) def test_used_list(self): # Construct a module m = ir.Module("test_used_list") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) # Add a function and mark it as a kernel fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)]) - kernel = ir.Function(m, fty, name='mycudakernel') - bldr = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="mycudakernel") + bldr = ir.IRBuilder(kernel.append_basic_block("entry")) bldr.ret_void() nvvm.set_cuda_kernel(kernel) # Verify that the used list was correctly constructed - used_lines = [line for line in str(m).splitlines() - if 'llvm.used' in line] + used_lines = [ + line for line in str(m).splitlines() if "llvm.used" in line + ] msg = 'Expected exactly one @"llvm.used" array' self.assertEqual(len(used_lines), 1, msg) @@ -93,70 +94,71 @@ def test_nvvm_ir_verify_fail(self): m.triple = "unknown-unknown-unknown" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) - with self.assertRaisesRegex(NvvmError, 'Invalid target triple'): + with self.assertRaisesRegex(NvvmError, "Invalid target triple"): nvvm.compile_ir(str(m)) def _test_nvvm_support(self, arch): - compute_xx = 'compute_{0}{1}'.format(*arch) + compute_xx = "compute_{0}{1}".format(*arch) nvvmir = self.get_nvvmir() - ptx = nvvm.compile_ir(nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0, - 
prec_div=0).decode('utf8') + ptx = nvvm.compile_ir( + nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0, prec_div=0 + ).decode("utf8") self.assertIn(".target sm_{0}{1}".format(*arch), ptx) - self.assertIn('simple', ptx) - self.assertIn('ave', ptx) + self.assertIn("simple", ptx) + self.assertIn("ave", ptx) def test_nvvm_support(self): - """Test supported CC by NVVM - """ + """Test supported CC by NVVM""" for arch in nvvm.get_supported_ccs(): self._test_nvvm_support(arch=arch) def test_nvvm_warning(self): m = ir.Module("test_nvvm_warning") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) fty = ir.FunctionType(ir.VoidType(), []) - kernel = ir.Function(m, fty, name='inlinekernel') - builder = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="inlinekernel") + builder = ir.IRBuilder(kernel.append_basic_block("entry")) builder.ret_void() nvvm.set_cuda_kernel(kernel) # Add the noinline attribute to trigger NVVM to generate a warning - kernel.attributes.add('noinline') + kernel.attributes.add("noinline") with warnings.catch_warnings(record=True) as w: nvvm.compile_ir(str(m)) self.assertEqual(len(w), 1) - self.assertIn('overriding noinline attribute', str(w[0])) + self.assertIn("overriding noinline attribute", str(w[0])) -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestArchOption(unittest.TestCase): def test_get_arch_option(self): # Test returning the nearest lowest arch. - self.assertEqual(nvvm.get_arch_option(5, 3), 'compute_53') - self.assertEqual(nvvm.get_arch_option(7, 5), 'compute_75') - self.assertEqual(nvvm.get_arch_option(7, 7), 'compute_75') + self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53") + self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75") + self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75") # Test known arch. 
supported_cc = nvvm.get_supported_ccs() for arch in supported_cc: - self.assertEqual(nvvm.get_arch_option(*arch), 'compute_%d%d' % arch) - self.assertEqual(nvvm.get_arch_option(1000, 0), - 'compute_%d%d' % supported_cc[-1]) + self.assertEqual(nvvm.get_arch_option(*arch), "compute_%d%d" % arch) + self.assertEqual( + nvvm.get_arch_option(1000, 0), "compute_%d%d" % supported_cc[-1] + ) -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestLibDevice(unittest.TestCase): def test_libdevice_load(self): # Test that constructing LibDevice gives a bitcode file libdevice = LibDevice() - self.assertEqual(libdevice.bc[:4], b'BC\xc0\xde') + self.assertEqual(libdevice.bc[:4], b"BC\xc0\xde") -nvvmir_generic = '''\ +nvvmir_generic = """\ target triple="nvptx64-nvidia-cuda" target datalayout = "{data_layout}" @@ -194,8 +196,8 @@ def test_libdevice_load(self): !2 = !{{void (i32*)* @simple, !"kernel", i32 1}} @"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i32*)* @simple to i8*)], section "llvm.metadata" -''' # noqa: E501 +""" # noqa: E501 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py b/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py index ef727c5a8..8e4d811d1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py @@ -6,7 +6,6 @@ class TestPinned(ContextResettingTestCase): - def _run_copies(self, A): A0 = np.copy(A) @@ -20,8 +19,8 @@ def _run_copies(self, A): def test_pinned(self): machine = platform.machine() - if machine.startswith('arm') or machine.startswith('aarch64'): - count = 262144 # 2MB + if machine.startswith("arm") or machine.startswith("aarch64"): + count = 262144 # 2MB else: count = 2097152 # 16MB A = np.arange(count) @@ -29,9 +28,9 @@ def test_pinned(self): self._run_copies(A) def test_unpinned(self): - A = np.arange(2 * 
1024 * 1024) # 16 MB + A = np.arange(2 * 1024 * 1024) # 16 MB self._run_copies(A) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py b/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py index 1660d4d42..a1d7a95ce 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py @@ -4,7 +4,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('CUDA Profiler unsupported in the simulator') +@skip_on_cudasim("CUDA Profiler unsupported in the simulator") class TestProfiler(ContextResettingTestCase): def test_profiling(self): with cuda.profiling(): @@ -16,5 +16,5 @@ def test_profiling(self): del a -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py b/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py index b03fd3647..a532f8c28 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py @@ -2,8 +2,11 @@ import logging import traceback from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_with_cuda_python, - skip_under_cuda_memcheck) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_with_cuda_python, + skip_under_cuda_memcheck, +) from numba.tests.support import linux_only @@ -23,12 +26,12 @@ def child_test(): # used. 
logbuf = io.StringIO() handler = logging.StreamHandler(logbuf) - cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver') + cudadrv_logger = logging.getLogger("numba.cuda.cudadrv.driver") cudadrv_logger.addHandler(handler) cudadrv_logger.setLevel(logging.DEBUG) # Set up data for our test, and copy over to the device - N = 2 ** 16 + N = 2**16 N_THREADS = 10 N_ADDITIONS = 4096 @@ -65,8 +68,10 @@ def kernel_thread(n): f[n_blocks, n_threads, stream](rs[n], xs[n]) # Create threads - threads = [threading.Thread(target=kernel_thread, args=(i,)) - for i in range(N_THREADS)] + threads = [ + threading.Thread(target=kernel_thread, args=(i,)) + for i in range(N_THREADS) + ] # Start all threads for thread in threads: @@ -95,7 +100,7 @@ def child_test_wrapper(result_queue): output = child_test() success = True # Catch anything raised so it can be propagated - except: # noqa: E722 + except: # noqa: E722 output = traceback.format_exc() success = False @@ -105,13 +110,13 @@ def child_test_wrapper(result_queue): # Run on Linux only until the reason for test hangs on Windows (Issue #8635, # https://github.com/numba/numba/issues/8635) is diagnosed @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Streams not supported on the simulator') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Streams not supported on the simulator") class TestPTDS(CUDATestCase): - @skip_with_cuda_python('Function names unchanged for PTDS with NV Binding') + @skip_with_cuda_python("Function names unchanged for PTDS with NV Binding") def test_ptds(self): # Run a test with PTDS enabled in a child process - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=child_test_wrapper, args=(result_queue,)) proc.start() @@ -124,8 +129,11 @@ def test_ptds(self): # Functions with a per-thread default stream variant that we expect to # see in the output - ptds_functions = ('cuMemcpyHtoD_v2_ptds', 
'cuLaunchKernel_ptsz', - 'cuMemcpyDtoH_v2_ptds') + ptds_functions = ( + "cuMemcpyHtoD_v2_ptds", + "cuLaunchKernel_ptsz", + "cuMemcpyDtoH_v2_ptds", + ) for fn in ptds_functions: with self.subTest(fn=fn, expected=True): @@ -133,17 +141,20 @@ def test_ptds(self): # Non-PTDS versions of the functions that we should not see in the # output: - legacy_functions = ('cuMemcpyHtoD_v2', 'cuLaunchKernel', - 'cuMemcpyDtoH_v2') + legacy_functions = ( + "cuMemcpyHtoD_v2", + "cuLaunchKernel", + "cuMemcpyDtoH_v2", + ) for fn in legacy_functions: with self.subTest(fn=fn, expected=False): # Ensure we only spot these function names appearing without a # _ptds or _ptsz suffix by checking including the end of the # line in the log - fn_at_end = f'{fn}\n' + fn_at_end = f"{fn}\n" self.assertNotIn(fn_at_end, output) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py index f2e0b6d10..d7a8ae384 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py @@ -7,7 +7,6 @@ class TestResetDevice(ContextResettingTestCase): def test_reset_device(self): - def newthread(exception_queue): try: devices = range(driver.get_device_count()) @@ -32,5 +31,5 @@ def newthread(exception_queue): self.assertEqual(exceptions, []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py b/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py index 51e0722ec..4cb3d09cf 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py @@ -11,32 +11,40 @@ def set_visible_devices_and_check(q): from numba import cuda import os - os.environ['CUDA_VISIBLE_DEVICES'] = '0' + os.environ["CUDA_VISIBLE_DEVICES"] = "0" q.put(len(cuda.gpus.lst)) - except: # noqa: E722 + except: 
# noqa: E722 # Sentinel value for error executing test code q.put(-1) if config.ENABLE_CUDASIM: - SUPPORTED_VERSIONS = (-1, -1), + SUPPORTED_VERSIONS = ((-1, -1),) else: - SUPPORTED_VERSIONS = ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), - (11, 6), (11, 7)) + SUPPORTED_VERSIONS = ( + (11, 0), + (11, 1), + (11, 2), + (11, 3), + (11, 4), + (11, 5), + (11, 6), + (11, 7), + ) class TestRuntime(unittest.TestCase): def test_is_supported_version_true(self): for v in SUPPORTED_VERSIONS: - with patch.object(runtime, 'get_version', return_value=v): + with patch.object(runtime, "get_version", return_value=v): self.assertTrue(runtime.is_supported_version()) - @skip_on_cudasim('The simulator always simulates a supported runtime') + @skip_on_cudasim("The simulator always simulates a supported runtime") def test_is_supported_version_false(self): # Check with an old unsupported version and some potential future # versions for v in ((10, 2), (11, 8), (12, 0)): - with patch.object(runtime, 'get_version', return_value=v): + with patch.object(runtime, "get_version", return_value=v): self.assertFalse(runtime.is_supported_version()) def test_supported_versions(self): @@ -57,13 +65,13 @@ def test_visible_devices_set_after_import(self): from numba import cuda if len(cuda.gpus.lst) in (0, 1): - self.skipTest('This test requires multiple GPUs') + self.skipTest("This test requires multiple GPUs") - if os.environ.get('CUDA_VISIBLE_DEVICES'): - msg = 'Cannot test when CUDA_VISIBLE_DEVICES already set' + if os.environ.get("CUDA_VISIBLE_DEVICES"): + msg = "Cannot test when CUDA_VISIBLE_DEVICES already set" self.skipTest(msg) - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") q = ctx.Queue() p = ctx.Process(target=set_visible_devices_and_check, args=(q,)) p.start() @@ -74,12 +82,12 @@ def test_visible_devices_set_after_import(self): # Make an obvious distinction between an error running the test code # and an incorrect number of GPUs in the list - 
msg = 'Error running set_visible_devices_and_check' + msg = "Error running set_visible_devices_and_check" self.assertNotEqual(visible_gpu_count, -1, msg=msg) # The actual check that we see only one GPU self.assertEqual(visible_gpu_count, 1) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py index aca78d94b..e592a4773 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py @@ -37,5 +37,5 @@ def test_select_device(self): self.assertEqual(exceptions, []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py b/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py index c4fbec19f..e2154dda8 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py @@ -15,10 +15,11 @@ def runner(*args, **kwds): return loop.run_until_complete(f(*args, **kwds)) finally: loop.close() + return runner -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestCudaStream(CUDATestCase): def test_add_callback(self): def callback(stream, status, event): @@ -89,7 +90,7 @@ async def test_cancelled_future(self): self.assertTrue(done2.done()) -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestFailingStream(CUDATestCase): # This test can only be run in isolation because it corrupts the CUDA # context, which cannot be recovered from within the same process. 
It is @@ -118,5 +119,5 @@ async def test_failed_stream(self): self.assertIsNotNone(done.exception()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py index ad6d9ad57..c9f7c6975 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py @@ -17,6 +17,7 @@ class UseCase: The return type is inferred from the type of the first argument, unless it is explicitly overridden by the ``retty`` kwarg. """ + def __init__(self, func, retty=None): self._func = func self._retty = retty @@ -59,6 +60,7 @@ def add_nocache_usecase_kernel(r, x, y): # Inner / outer cached / uncached cases + @cuda.jit(cache=True) def inner(x, y): return x + y + Z @@ -81,13 +83,13 @@ def outer_uncached_kernel(r, x, y): # Exercise returning a record instance. This used to hardcode the dtype # pointer's value in the bitcode. 
-packed_record_type = np.dtype([('a', np.int8), ('b', np.float64)]) -aligned_record_type = np.dtype([('a', np.int8), ('b', np.float64)], align=True) +packed_record_type = np.dtype([("a", np.int8), ("b", np.float64)]) +aligned_record_type = np.dtype([("a", np.int8), ("b", np.float64)], align=True) packed_arr = np.empty(2, dtype=packed_record_type) for i in range(packed_arr.size): - packed_arr[i]['a'] = i + 1 - packed_arr[i]['b'] = i + 42.5 + packed_arr[i]["a"] = i + 1 + packed_arr[i]["b"] = i + 42.5 aligned_arr = np.array(packed_arr, dtype=aligned_record_type) @@ -103,6 +105,7 @@ def record_return(r, ary, i): # Closure test cases + def make_closure(x): @cuda.jit(cache=True) def closure(r, y): @@ -119,6 +122,7 @@ def closure(r, y): # Ambiguous / renamed functions + @cuda.jit(cache=True) def ambiguous_function(r, x): r[()] = x[()] + 2 @@ -190,6 +194,7 @@ def many_locals(): # Simple use case for multiprocessing test + @cuda.jit(cache=True) def simple_usecase_kernel(r, x): r[()] = x[()] @@ -200,6 +205,7 @@ def simple_usecase_kernel(r, x): # Usecase with cooperative groups + @cuda.jit(cache=True) def cg_usecase_kernel(r, x): grid = cuda.cg.this_grid() diff --git a/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py index 07b42d755..a58b3c141 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py @@ -12,6 +12,7 @@ def _call(self, ret, *args): # Using the same function as a cached CPU and CUDA-jitted function + def target_shared_assign(r, x): r[()] = x[()] diff --git a/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py index 1e639d379..897b0bbd8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py @@ -23,7 +23,7 @@ def __init__(self): register_model, 
make_attribute_wrapper, typeof_impl, - type_callable + type_callable, ) from numba.cuda.cudaimpl import lower from numba.core import cgutils @@ -38,21 +38,22 @@ def __init__(self, dmm, fe_type): members = [("x", int32), ("y", int32)] super().__init__(dmm, fe_type, members) - make_attribute_wrapper(TestStructModelType, 'x', 'x') - make_attribute_wrapper(TestStructModelType, 'y', 'y') + make_attribute_wrapper(TestStructModelType, "x", "x") + make_attribute_wrapper(TestStructModelType, "y", "y") @type_callable(TestStruct) def type_test_struct(context): def typer(x, y): if isinstance(x, types.Integer) and isinstance(y, types.Integer): return test_struct_model_type + return typer @lower(TestStruct, types.Integer, types.Integer) def lower_test_type_ctor(context, builder, sig, args): - obj = cgutils.create_struct_proxy( - test_struct_model_type - )(context, builder) + obj = cgutils.create_struct_proxy(test_struct_model_type)( + context, builder + ) obj.x = args[0] obj.y = args[1] return obj._getvalue() diff --git a/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx b/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx index 8cc1aa6d6..0a818041a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +++ b/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx @@ -26,5 +26,3 @@ st.param.b32 [func_retval0+0], %r3; ret; } - - diff --git a/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py index b182359b1..0bc1cf605 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py @@ -97,4 +97,5 @@ def make_list(n): return None return (n, make_list(n - 1)) + return make_list diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py b/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py index 7c7dff8ca..a3183ae47 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py @@ -6,7 +6,7 @@ 
class TestAlignment(CUDATestCase): def test_record_alignment(self): - rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')], align=True) + rec_dtype = np.dtype([("a", "int32"), ("b", "float64")], align=True) rec = from_dtype(rec_dtype) @cuda.jit((rec[:],)) @@ -24,19 +24,20 @@ def foo(a): self.assertTrue(np.all(a_recarray.a == a_recarray.b)) - @skip_on_cudasim('Simulator does not check alignment') + @skip_on_cudasim("Simulator does not check alignment") def test_record_alignment_error(self): - rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')]) + rec_dtype = np.dtype([("a", "int32"), ("b", "float64")]) rec = from_dtype(rec_dtype) with self.assertRaises(Exception) as raises: + @cuda.jit((rec[:],)) def foo(a): i = cuda.grid(1) a[i].a = a[i].b - self.assertTrue('type float64 is not aligned' in str(raises.exception)) + self.assertTrue("type float64 is not aligned" in str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array.py b/numba_cuda/numba/cuda/tests/cudapy/test_array.py index fdd759e76..a244b762f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array.py @@ -8,8 +8,11 @@ if config.ENABLE_CUDASIM: ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.pinned_array_like) else: - ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.mapped_array_like, - cuda.pinned_array_like) + ARRAY_LIKE_FUNCTIONS = ( + cuda.device_array_like, + cuda.mapped_array_like, + cuda.pinned_array_like, + ) def array_reshape1d(arr, newshape, got): @@ -55,8 +58,7 @@ def test_null_shape(self): self.assertEqual(shape2, null_shape) def test_gpu_array_strided(self): - - @cuda.jit('void(double[:])') + @cuda.jit("void(double[:])") def kernel(x): i = cuda.grid(1) if i < x.shape[0]: @@ -69,8 +71,7 @@ def kernel(x): self.assertTrue(np.allclose(z, list(range(9)))) def test_gpu_array_interleaved(self): - - @cuda.jit('void(double[:], double[:])') 
+ @cuda.jit("void(double[:], double[:])") def copykernel(x, y): i = cuda.grid(1) if i < x.shape[0]: @@ -86,8 +87,10 @@ def copykernel(x, y): except ValueError: pass else: - raise AssertionError("Should raise exception complaining the " - "contiguous-ness of the array.") + raise AssertionError( + "Should raise exception complaining the " + "contiguous-ness of the array." + ) # Should we handle this use case? # assert z.size == y.size # copykernel[1, n](y, x) @@ -108,55 +111,57 @@ def _test_array_like_same(self, like_func, array): self.assertEqual(array.shape, array_like.shape) self.assertEqual(array.strides, array_like.strides) self.assertEqual(array.dtype, array_like.dtype) - self.assertEqual(array.flags['C_CONTIGUOUS'], - array_like.flags['C_CONTIGUOUS']) - self.assertEqual(array.flags['F_CONTIGUOUS'], - array_like.flags['F_CONTIGUOUS']) + self.assertEqual( + array.flags["C_CONTIGUOUS"], array_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + array.flags["F_CONTIGUOUS"], array_like.flags["F_CONTIGUOUS"] + ) def test_array_like_1d(self): - d_a = cuda.device_array(10, order='C') + d_a = cuda.device_array(10, order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_transpose(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_3d(self): - d_a = cuda.device_array((10, 12, 14), order='C') + d_a = cuda.device_array((10, 12, 14), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): 
self._test_array_like_same(like_func, d_a) def test_array_like_1d_f(self): - d_a = cuda.device_array(10, order='F') + d_a = cuda.device_array(10, order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_f(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_f_transpose(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_3d_f(self): - d_a = cuda.device_array((10, 12, 14), order='F') + d_a = cuda.device_array((10, 12, 14), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) @@ -173,10 +178,12 @@ def _test_array_like_view(self, like_func, view, d_view): # Use NumPy as a reference for the expected strides np_like = np.zeros_like(view) self.assertEqual(nb_like.strides, np_like.strides) - self.assertEqual(nb_like.flags['C_CONTIGUOUS'], - np_like.flags['C_CONTIGUOUS']) - self.assertEqual(nb_like.flags['F_CONTIGUOUS'], - np_like.flags['F_CONTIGUOUS']) + self.assertEqual( + nb_like.flags["C_CONTIGUOUS"], np_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + nb_like.flags["F_CONTIGUOUS"], np_like.flags["F_CONTIGUOUS"] + ) def test_array_like_1d_view(self): shape = 10 @@ -188,8 +195,8 @@ def test_array_like_1d_view(self): def test_array_like_1d_view_f(self): shape = 10 - view = np.zeros(shape, order='F')[::2] - d_view = cuda.device_array(shape, order='F')[::2] + view = np.zeros(shape, order="F")[::2] + d_view = cuda.device_array(shape, order="F")[::2] for like_func in ARRAY_LIKE_FUNCTIONS: with 
self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) @@ -204,13 +211,13 @@ def test_array_like_2d_view(self): def test_array_like_2d_view_f(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2] - d_view = cuda.device_array(shape, order='F')[::2, ::2] + view = np.zeros(shape, order="F")[::2, ::2] + d_view = cuda.device_array(shape, order="F")[::2, ::2] for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) - @skip_on_cudasim('Numba and NumPy stride semantics differ for transpose') + @skip_on_cudasim("Numba and NumPy stride semantics differ for transpose") def test_array_like_2d_view_transpose_device(self): shape = (10, 12) d_view = cuda.device_array(shape)[::2, ::2].T @@ -224,11 +231,12 @@ def test_array_like_2d_view_transpose_device(self): self.assertEqual(d_view.shape, like.shape) self.assertEqual(d_view.dtype, like.dtype) self.assertEqual((40, 8), like.strides) - self.assertTrue(like.flags['C_CONTIGUOUS']) - self.assertFalse(like.flags['F_CONTIGUOUS']) + self.assertTrue(like.flags["C_CONTIGUOUS"]) + self.assertFalse(like.flags["F_CONTIGUOUS"]) - @skip_unless_cudasim('Numba and NumPy stride semantics differ for ' - 'transpose') + @skip_unless_cudasim( + "Numba and NumPy stride semantics differ for transpose" + ) def test_array_like_2d_view_transpose_simulator(self): shape = (10, 12) view = np.zeros(shape)[::2, ::2].T @@ -243,20 +251,22 @@ def test_array_like_2d_view_transpose_simulator(self): self.assertEqual(d_view.shape, nb_like.shape) self.assertEqual(d_view.dtype, nb_like.dtype) self.assertEqual(np_like.strides, nb_like.strides) - self.assertEqual(np_like.flags['C_CONTIGUOUS'], - nb_like.flags['C_CONTIGUOUS']) - self.assertEqual(np_like.flags['F_CONTIGUOUS'], - nb_like.flags['F_CONTIGUOUS']) + self.assertEqual( + np_like.flags["C_CONTIGUOUS"], nb_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + np_like.flags["F_CONTIGUOUS"], 
nb_like.flags["F_CONTIGUOUS"] + ) def test_array_like_2d_view_f_transpose(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2].T - d_view = cuda.device_array(shape, order='F')[::2, ::2].T + view = np.zeros(shape, order="F")[::2, ::2].T + d_view = cuda.device_array(shape, order="F")[::2, ::2].T for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) - @skip_on_cudasim('Kernel overloads not created in the simulator') + @skip_on_cudasim("Kernel overloads not created in the simulator") def test_issue_4628(self): # CUDA Device arrays were reported as always being typed with 'A' order # so launching the kernel with a host array and then a device array @@ -318,7 +328,7 @@ def check_empty(arr): check(array_reshape, array_reshape3d, arr, (8, 1, 3)) # Test negative shape value - arr = np.arange(25).reshape(5,5) + arr = np.arange(25).reshape(5, 5) check(array_reshape, array_reshape1d, arr, -1) check(array_reshape, array_reshape1d, arr, (-1,)) check(array_reshape, array_reshape2d, arr, (-1, 5)) @@ -329,5 +339,5 @@ def check_empty(arr): check_empty(arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py index 87db4a6c7..1e3b1d920 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py @@ -7,12 +7,11 @@ class TestCudaArrayArg(CUDATestCase): def test_array_ary(self): - - @cuda.jit('double(double[:],int64)', device=True, inline=True) + @cuda.jit("double(double[:],int64)", device=True, inline=True) def device_function(a, c): return a[c] - @cuda.jit('void(double[:],double[:])') + @cuda.jit("void(double[:],double[:])") def kernel(x, y): i = cuda.grid(1) y[i] = device_function(x, i) @@ -63,7 +62,7 @@ def f(r, x): r[0] = x.x r[1] = x.y - Point = namedtuple('Point', ('x', 'y')) + Point 
= namedtuple("Point", ("x", "y")) x = Point(1, 2) r = np.zeros(len(x), dtype=np.int64) f[1, 1](r, x) @@ -78,7 +77,7 @@ def f(r1, r2, x): r1[1] = x.y r2[0] = x.r - Point = namedtuple('Point', ('x', 'y', 'r')) + Point = namedtuple("Point", ("x", "y", "r")) x = Point(1, 2, 2.236) r1 = np.zeros(2, dtype=np.int64) r2 = np.zeros(1, dtype=np.float64) @@ -197,5 +196,5 @@ def f(r, x): self.assertEqual(r[4], 3) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py index 7f129b5df..ceb884700 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py @@ -31,5 +31,5 @@ def test_reinterpret_array_type(self): self.assertEqual(expect, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py index 86dbb22c1..e4f057ba0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py @@ -22,9 +22,17 @@ def atomic_cast_none(num): @cuda.jit(device=True) -def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements, - binop_func, cast_func, initializer, - neg_idx): +def atomic_binary_1dim_shared( + ary, + idx, + op2, + ary_dtype, + ary_nelements, + binop_func, + cast_func, + initializer, + neg_idx, +): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) sm[tid] = initializer @@ -38,8 +46,9 @@ def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements, @cuda.jit(device=True) -def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements, - binop_func, cast_func): +def atomic_binary_1dim_shared2( + ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func +): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) 
sm[tid] = ary[tid] @@ -51,8 +60,9 @@ def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements, @cuda.jit(device=True) -def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape, - binop_func, y_cast_func, neg_idx): +def atomic_binary_2dim_shared( + ary, op2, ary_dtype, ary_shape, binop_func, y_cast_func, neg_idx +): tx = cuda.threadIdx.x ty = cuda.threadIdx.y sm = cuda.shared.array(ary_shape, ary_dtype) @@ -77,8 +87,9 @@ def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx): @cuda.jit(device=True) -def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, - binop_func, neg_idx): +def atomic_binary_1dim_global( + ary, idx, ary_nelements, op2, binop_func, neg_idx +): tid = cuda.threadIdx.x bin = int(idx[tid] % ary_nelements) if neg_idx: @@ -87,53 +98,79 @@ def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, def atomic_add(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.add, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, False + ) def atomic_add_wrap(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.add, atomic_cast_none, 0, True) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, True + ) def atomic_add2(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add2_wrap(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add3(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def 
atomic_add_float(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.add, atomic_cast_to_int, 0.0, False) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.add, + atomic_cast_to_int, + 0.0, + False, + ) def atomic_add_float_wrap(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.add, atomic_cast_to_int, 0.0, True) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.add, + atomic_cast_to_int, + 0.0, + True, + ) def atomic_add_float_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add_float_2_wrap(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add_float_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_add_double_global(idx, ary): @@ -153,78 +190,117 @@ def atomic_add_double_global_2_wrap(ary): def atomic_add_double_global_3(ary): - atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64, - False) + atomic_binary_2dim_global( + ary, 1, cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_add_double(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.add, atomic_cast_none, 0.0, False) + atomic_binary_1dim_shared( + ary, + idx, + 1.0, + float64, + 32, + cuda.atomic.add, + atomic_cast_none, + 0.0, + False, + ) def atomic_add_double_wrap(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.add, atomic_cast_none, 0.0, True) + atomic_binary_1dim_shared( + ary, idx, 1.0, 
float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, True + ) def atomic_add_double_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add_double_2_wrap(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add_double_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_sub(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.sub, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.sub, atomic_cast_none, 0, False + ) def atomic_sub2(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub3(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_float(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.sub, atomic_cast_to_int, 0.0, False) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.sub, + atomic_cast_to_int, + 0.0, + False, + ) def atomic_sub_float_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_float_3(ary): - atomic_binary_2dim_shared(ary, 
1.0, float32, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_double(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.sub, atomic_cast_none, 0.0, False) + atomic_binary_1dim_shared( + ary, + idx, + 1.0, + float64, + 32, + cuda.atomic.sub, + atomic_cast_none, + 0.0, + False, + ) def atomic_sub_double_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_double_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_double_global(idx, ary): @@ -232,28 +308,33 @@ def atomic_sub_double_global(idx, ary): def atomic_sub_double_global_2(ary): - atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none, - False) + atomic_binary_2dim_global( + ary, 1.0, cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_double_global_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_and(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.and_, atomic_cast_none, 1, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.and_, atomic_cast_none, 1, False + ) def atomic_and2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.and_, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_none, False + ) def atomic_and3(ary, op2): - atomic_binary_2dim_shared(ary, 
op2, uint32, (4, 8), - cuda.atomic.and_, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_to_uint64, False + ) def atomic_and_global(idx, ary, op2): @@ -261,23 +342,27 @@ def atomic_and_global(idx, ary, op2): def atomic_and_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.and_, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.and_, atomic_cast_none, False + ) def atomic_or(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.or_, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.or_, atomic_cast_none, 0, False + ) def atomic_or2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.or_, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_none, False + ) def atomic_or3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.or_, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_to_uint64, False + ) def atomic_or_global(idx, ary, op2): @@ -285,23 +370,27 @@ def atomic_or_global(idx, ary, op2): def atomic_or_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.or_, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.or_, atomic_cast_none, False + ) def atomic_xor(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.xor, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.xor, atomic_cast_none, 0, False + ) def atomic_xor2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.xor, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_none, False + ) def atomic_xor3(ary, op2): - atomic_binary_2dim_shared(ary, 
op2, uint32, (4, 8), - cuda.atomic.xor, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_to_uint64, False + ) def atomic_xor_global(idx, ary, op2): @@ -309,33 +398,39 @@ def atomic_xor_global(idx, ary, op2): def atomic_xor_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.xor, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.xor, atomic_cast_none, False + ) def atomic_inc32(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, - cuda.atomic.inc, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.inc, atomic_cast_none + ) def atomic_inc64(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, - cuda.atomic.inc, atomic_cast_to_int) + atomic_binary_1dim_shared2( + ary, idx, op2, uint64, 32, cuda.atomic.inc, atomic_cast_to_int + ) def atomic_inc2_32(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.inc, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_none, False + ) def atomic_inc2_64(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.inc, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint64, (4, 8), cuda.atomic.inc, atomic_cast_none, False + ) def atomic_inc3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.inc, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_to_uint64, False + ) def atomic_inc_global(idx, ary, op2): @@ -343,33 +438,39 @@ def atomic_inc_global(idx, ary, op2): def atomic_inc_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.inc, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.inc, atomic_cast_none, False + ) def atomic_dec32(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, 
op2, uint32, 32, - cuda.atomic.dec, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.dec, atomic_cast_none + ) def atomic_dec64(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, - cuda.atomic.dec, atomic_cast_to_int) + atomic_binary_1dim_shared2( + ary, idx, op2, uint64, 32, cuda.atomic.dec, atomic_cast_to_int + ) def atomic_dec2_32(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.dec, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_none, False + ) def atomic_dec2_64(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.dec, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint64, (4, 8), cuda.atomic.dec, atomic_cast_none, False + ) def atomic_dec3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.dec, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_to_uint64, False + ) def atomic_dec_global(idx, ary, op2): @@ -377,23 +478,27 @@ def atomic_dec_global(idx, ary, op2): def atomic_dec_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.dec, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.dec, atomic_cast_none, False + ) def atomic_exch(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, - cuda.atomic.exch, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.exch, atomic_cast_none + ) def atomic_exch2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.exch, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.exch, atomic_cast_none, False + ) def atomic_exch3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.exch, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, 
uint64, (4, 8), cuda.atomic.exch, atomic_cast_none, False + ) def atomic_exch_global(idx, ary, op2): @@ -401,7 +506,6 @@ def atomic_exch_global(idx, ary, op2): def gen_atomic_extreme_funcs(func): - fns = dedent(""" def atomic(res, ary): tx = cuda.threadIdx.x @@ -431,21 +535,39 @@ def atomic_double_shared(res, ary): res[0] = smres[0] """).format(func=func) ld = {} - exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld) - return (ld['atomic'], ld['atomic_double_normalizedindex'], - ld['atomic_double_oneindex'], ld['atomic_double_shared']) - - -(atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex, - atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max') -(atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex, - atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min') -(atomic_nanmax, atomic_nanmax_double_normalizedindex, - atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \ - gen_atomic_extreme_funcs('cuda.atomic.nanmax') -(atomic_nanmin, atomic_nanmin_double_normalizedindex, - atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \ - gen_atomic_extreme_funcs('cuda.atomic.nanmin') + exec(fns, {"cuda": cuda, "float64": float64, "uint64": uint64}, ld) + return ( + ld["atomic"], + ld["atomic_double_normalizedindex"], + ld["atomic_double_oneindex"], + ld["atomic_double_shared"], + ) + + +( + atomic_max, + atomic_max_double_normalizedindex, + atomic_max_double_oneindex, + atomic_max_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.max") +( + atomic_min, + atomic_min_double_normalizedindex, + atomic_min_double_oneindex, + atomic_min_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.min") +( + atomic_nanmax, + atomic_nanmax_double_normalizedindex, + atomic_nanmax_double_oneindex, + atomic_nanmax_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.nanmax") +( + atomic_nanmin, + atomic_nanmin_double_normalizedindex, + atomic_nanmin_double_oneindex, 
+ atomic_nanmin_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.nanmin") def atomic_compare_and_swap(res, old, ary, fill_val): @@ -476,10 +598,10 @@ def test_atomic_add(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add) + cuda_atomic_add = cuda.jit("void(uint32[:])")(atomic_add) cuda_atomic_add[1, 32](ary) - cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap) + cuda_atomic_add_wrap = cuda.jit("void(uint32[:])")(atomic_add_wrap) cuda_atomic_add_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -494,10 +616,10 @@ def test_atomic_add2(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2) + cuda_atomic_add2 = cuda.jit("void(uint32[:,:])")(atomic_add2) cuda_atomic_add2[1, (4, 8)](ary) - cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap) + cuda_atomic_add2_wrap = cuda.jit("void(uint32[:,:])")(atomic_add2_wrap) cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) @@ -506,7 +628,7 @@ def test_atomic_add2(self): def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3) + cuda_atomic_add3 = cuda.jit("void(uint32[:,:])")(atomic_add3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) @@ -516,10 +638,10 @@ def test_atomic_add_float(self): ary_wrap = ary.copy() orig = ary.copy().astype(np.intp) - cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float) + cuda_atomic_add_float = cuda.jit("void(float32[:])")(atomic_add_float) cuda_atomic_add_float[1, 32](ary) - add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap) + add_float_wrap = cuda.jit("void(float32[:])")(atomic_add_float_wrap) add_float_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -534,10 +656,10 @@ def test_atomic_add_float_2(self): 
ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2) + cuda_atomic_add2 = cuda.jit("void(float32[:,:])")(atomic_add_float_2) cuda_atomic_add2[1, (4, 8)](ary) - cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap) + cuda_func_wrap = cuda.jit("void(float32[:,:])")(atomic_add_float_2_wrap) cuda_func_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) @@ -546,7 +668,7 @@ def test_atomic_add_float_2(self): def test_atomic_add_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3) + cuda_atomic_add3 = cuda.jit("void(float32[:,:])")(atomic_add_float_3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) @@ -561,24 +683,24 @@ def assertCorrectFloat64Atomics(self, kernel, shared=True): inst = "(red|atom)" if shared: - inst = f'{inst}\\.shared' + inst = f"{inst}\\.shared" - self.assertRegex(asm, f'{inst}.add.f64', asm) + self.assertRegex(asm, f"{inst}.add.f64", asm) else: if shared: - self.assertIn('atom.shared.cas.b64', asm) + self.assertIn("atom.shared.cas.b64", asm) else: - self.assertIn('atom.cas.b64', asm) + self.assertIn("atom.cas.b64", asm) def test_atomic_add_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) ary_wrap = ary.copy() - cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double) + cuda_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double) cuda_fn[1, 32](idx, ary) - wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap) + wrap_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double_wrap) wrap_fn[1, 32](idx, ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -595,10 +717,10 @@ def test_atomic_add_double_2(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2) + cuda_fn = 
cuda.jit("void(float64[:,:])")(atomic_add_double_2) cuda_fn[1, (4, 8)](ary) - cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap) + cuda_fn_wrap = cuda.jit("void(float64[:,:])")(atomic_add_double_2_wrap) cuda_fn_wrap[1, (4, 8)](ary_wrap) np.testing.assert_equal(ary, orig + 1) @@ -609,7 +731,7 @@ def test_atomic_add_double_2(self): def test_atomic_add_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) @@ -620,7 +742,7 @@ def test_atomic_add_double_global(self): ary = np.zeros(32, np.float64) ary_wrap = ary.copy() - sig = 'void(int64[:], float64[:])' + sig = "void(int64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_add_double_global) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap) @@ -641,7 +763,7 @@ def test_atomic_add_double_global_2(self): ary_wrap = ary.copy() orig = ary.copy() - sig = 'void(float64[:,:])' + sig = "void(float64[:,:])" cuda_func = cuda.jit(sig)(atomic_add_double_global_2) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap) @@ -656,7 +778,7 @@ def test_atomic_add_double_global_2(self): def test_atomic_add_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) @@ -665,7 +787,7 @@ def test_atomic_add_double_global_3(self): def test_atomic_sub(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub) + cuda_atomic_sub = cuda.jit("void(uint32[:])")(atomic_sub) cuda_atomic_sub[1, 32](ary) gold = np.zeros(32, 
dtype=np.uint32) @@ -677,21 +799,21 @@ def test_atomic_sub(self): def test_atomic_sub2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2) + cuda_atomic_sub2 = cuda.jit("void(uint32[:,:])")(atomic_sub2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3) + cuda_atomic_sub3 = cuda.jit("void(uint32[:,:])")(atomic_sub3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float(self): ary = np.random.randint(0, 32, size=32).astype(np.float32) orig = ary.copy().astype(np.intp) - cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float) + cuda_atomic_sub_float = cuda.jit("void(float32[:])")(atomic_sub_float) cuda_atomic_sub_float[1, 32](ary) gold = np.zeros(32, dtype=np.float32) @@ -703,21 +825,21 @@ def test_atomic_sub_float(self): def test_atomic_sub_float_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2) + cuda_atomic_sub2 = cuda.jit("void(float32[:,:])")(atomic_sub_float_2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3) + cuda_atomic_sub3 = cuda.jit("void(float32[:,:])")(atomic_sub_float_3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) - cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double) + cuda_func = 
cuda.jit("void(int64[:], float64[:])")(atomic_sub_double) cuda_func[1, 32](idx, ary) gold = np.zeros(32, dtype=np.float64) @@ -729,21 +851,21 @@ def test_atomic_sub_double(self): def test_atomic_sub_double_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) - sig = 'void(int64[:], float64[:])' + sig = "void(int64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_sub_double_global) cuda_func[1, 32](idx, ary) @@ -756,14 +878,14 @@ def test_atomic_sub_double_global(self): def test_atomic_sub_double_global_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) @@ -771,7 +893,7 @@ def test_atomic_and(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, 
size=32).astype(np.uint32) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_and) cuda_func[1, 32](ary, rand_const) gold = ary.copy() @@ -784,7 +906,7 @@ def test_atomic_and2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2) + cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) @@ -792,7 +914,7 @@ def test_atomic_and3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3) + cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) @@ -800,7 +922,7 @@ def test_atomic_and_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_and_global) cuda_func[1, 32](idx, ary, rand_const) @@ -814,7 +936,7 @@ def test_atomic_and_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_and_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig & rand_const) @@ -822,7 +944,7 @@ def test_atomic_or(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - 
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_or) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) @@ -835,7 +957,7 @@ def test_atomic_or2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2) + cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) @@ -843,7 +965,7 @@ def test_atomic_or3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3) + cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) @@ -851,7 +973,7 @@ def test_atomic_or_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_or_global) cuda_func[1, 32](idx, ary, rand_const) @@ -865,7 +987,7 @@ def test_atomic_or_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_or_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig | rand_const) @@ -873,7 +995,7 @@ def test_atomic_xor(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:], 
uint32)')(atomic_xor) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_xor) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) @@ -886,7 +1008,7 @@ def test_atomic_xor2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2) + cuda_atomic_xor2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor2) cuda_atomic_xor2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) @@ -894,7 +1016,7 @@ def test_atomic_xor3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3) + cuda_atomic_xor3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor3) cuda_atomic_xor3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) @@ -903,7 +1025,7 @@ def test_atomic_xor_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) gold = ary.copy() - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_xor_global) cuda_func[1, 32](idx, ary, rand_const) @@ -916,12 +1038,12 @@ def test_atomic_xor_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig ^ rand_const) def inc_dec_1dim_setup(self, dtype): - rconst = np.random.randint(32, dtype=dtype) + rconst = np.random.randint(32, dtype=dtype) rary = np.random.randint(0, 32, size=32).astype(dtype) ary_idx = np.arange(32, dtype=dtype) return rconst, rary, 
ary_idx @@ -951,131 +1073,141 @@ def check_inc(self, ary, rconst, sig, nblocks, blksize, func): def test_atomic_inc_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32) def test_atomic_inc_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' + sig = "void(uint64[:], uint64[:], uint64)" self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64) def test_atomic_inc2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_32) def test_atomic_inc2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64) + sig = "void(uint64[:,:], uint64)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_64) def test_atomic_inc3(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc3) def test_atomic_inc_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' - self.check_inc_index2(ary, idx, rand_const, sig, 1, 32, - atomic_inc_global) + sig = "void(uint32[:], uint32[:], uint32)" + self.check_inc_index2( + ary, idx, rand_const, sig, 1, 32, atomic_inc_global + ) def test_atomic_inc_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' - self.check_inc_index2(ary, idx, rand_const, 
sig, 1, 32, - atomic_inc_global) + sig = "void(uint64[:], uint64[:], uint64)" + self.check_inc_index2( + ary, idx, rand_const, sig, 1, 32, atomic_inc_global + ) def test_atomic_inc_global_2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2) def test_atomic_inc_global_2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) + sig = "void(uint64[:,:], uint64)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2) def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, idx, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](idx, ary, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def check_dec(self, ary, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def test_atomic_dec_32(self): rand_const, ary, idx = 
self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32) def test_atomic_dec_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' + sig = "void(uint64[:], uint64[:], uint64)" self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64) def test_atomic_dec2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_32) def test_atomic_dec2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64) + sig = "void(uint64[:,:], uint64)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_64) def test_atomic_dec3_new(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec3) def test_atomic_dec_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' - self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, - atomic_dec_global) + sig = "void(uint32[:], uint32[:], uint32)" + self.check_dec_index2( + ary, idx, rand_const, sig, 1, 32, atomic_dec_global + ) def test_atomic_dec_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' - self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, - atomic_dec_global) + sig = "void(uint64[:], uint64[:], uint64)" + self.check_dec_index2( + ary, idx, rand_const, sig, 1, 32, 
atomic_dec_global + ) def test_atomic_dec_global2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2) def test_atomic_dec_global2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) + sig = "void(uint64[:,:], uint64)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2) def test_atomic_exch(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32) idx = np.arange(32, dtype=np.uint32) - cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch) + cuda_func = cuda.jit("void(uint32[:], uint32[:], uint32)")(atomic_exch) cuda_func[1, 32](ary, idx, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1084,7 +1216,7 @@ def test_atomic_exch2(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_exch2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1092,7 +1224,7 @@ def test_atomic_exch3(self): rand_const = np.random.randint(50, 100, dtype=np.uint64) ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8) - cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3) + cuda_func = cuda.jit("void(uint64[:,:], uint64)")(atomic_exch3) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1101,7 +1233,7 @@ def test_atomic_exch_global(self): idx = np.arange(32, dtype=np.uint32) ary = np.random.randint(0, 32, size=32, dtype=np.uint32) - sig = 'void(uint32[:], 
uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" cuda_func = cuda.jit(sig)(atomic_exch_global) cuda_func[1, 32](idx, ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1135,8 +1267,9 @@ def test_atomic_max_double(self): def test_atomic_max_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:,:])')( - atomic_max_double_normalizedindex) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")( + atomic_max_double_normalizedindex + ) cuda_func[32, 32](res, vals) gold = np.max(vals) @@ -1145,8 +1278,9 @@ def test_atomic_max_double_normalizedindex(self): def test_atomic_max_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_max_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_max_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.max(vals) @@ -1182,8 +1316,9 @@ def test_atomic_min_double(self): def test_atomic_min_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.ones(1, np.float64) * 65535 - cuda_func = cuda.jit('void(float64[:], float64[:,:])')( - atomic_min_double_normalizedindex) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")( + atomic_min_double_normalizedindex + ) cuda_func[32, 32](res, vals) gold = np.min(vals) @@ -1192,8 +1327,9 @@ def test_atomic_min_double_normalizedindex(self): def test_atomic_min_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.ones(1, np.float64) * 128 - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_min_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_min_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.min(vals) @@ -1211,16 +1347,15 @@ def 
test_atomic_min_double_oneindex(self): # the result will be ary[idx] for either of ary[idx] or val being NaN. def _test_atomic_minmax_nan_location(self, func): + cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func) - cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) - - vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64) + vals = np.random.randint(0, 128, size=(1, 1)).astype(np.float64) res = np.zeros(1, np.float64) + np.nan cuda_func[1, 1](res, vals) np.testing.assert_equal(res, [np.nan]) def _test_atomic_minmax_nan_val(self, func): - cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func) res = np.random.randint(0, 128, size=1).astype(np.float64) gold = res.copy() @@ -1244,7 +1379,7 @@ def test_atomic_max_nan_val(self): def test_atomic_max_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.zeros(1, np.float64) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_max_double_shared) cuda_func[1, 32](res, vals) @@ -1254,7 +1389,7 @@ def test_atomic_max_double_shared(self): def test_atomic_min_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.ones(1, np.float64) * 32 - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_min_double_shared) cuda_func[1, 32](res, vals) @@ -1289,64 +1424,120 @@ def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1): np.testing.assert_array_equal(expect_out, out) def test_atomic_compare_and_swap(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=-99, + unfill=-1, + dtype=np.int32, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap2(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_compare_and_swap) + 
self.check_cas( + n=100, + fill=-45, + unfill=-1, + dtype=np.int64, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap3(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap4(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_compare_and_swap, + ) def test_atomic_cas_1dim(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_1dim + ) def test_atomic_cas_2dim(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=-99, + unfill=-1, + dtype=np.int32, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas2_1dim(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_1dim + ) def test_atomic_cas2_2dim(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=-45, + unfill=-1, + dtype=np.int64, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas3_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_cas_1dim) + 
self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_cas_1dim, + ) def test_atomic_cas3_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas4_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_cas_1dim, + ) def test_atomic_cas4_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_cas_2dim, + ndim=2, + ) # Tests that the atomic add, min, and max operations return the old value - # in the simulator, they did not (see Issue #5458). 
The max and min have @@ -1438,34 +1629,36 @@ def check_atomic_nanmax(self, dtype, lo, hi, init_val): np.testing.assert_equal(res, gold) def test_atomic_nanmax_int32(self): - self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmax( + dtype=np.int32, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmax_uint32(self): - self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_int64(self): - self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmax( + dtype=np.int64, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmax_uint64(self): - self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_float32(self): - self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmax( + dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmax_double(self): - self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmax( + dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmax_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([0], dtype=vals.dtype) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared) cuda_func[1, 32](res, vals) @@ -1476,8 +1669,9 @@ def test_atomic_nanmax_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_max_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_max_double_oneindex + ) 
cuda_func[1, 32](res, vals) gold = np.nanmax(vals) @@ -1495,34 +1689,36 @@ def check_atomic_nanmin(self, dtype, lo, hi, init_val): np.testing.assert_equal(res, gold) def test_atomic_nanmin_int32(self): - self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmin( + dtype=np.int32, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmin_uint32(self): - self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_int64(self): - self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmin( + dtype=np.int64, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmin_uint64(self): - self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_float(self): - self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmin( + dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmin_double(self): - self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmin( + dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmin_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([32], dtype=vals.dtype) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared) cuda_func[1, 32](res, vals) @@ -1533,8 +1729,9 @@ def test_atomic_nanmin_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([128], np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_min_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + 
atomic_min_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.nanmin(vals) @@ -1610,5 +1807,5 @@ def kernel(x): self._test_atomic_nan_returns_old(kernel, 11) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py b/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py index 1375162d9..7cf4d288f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py @@ -17,13 +17,23 @@ def cnd(d): K = 1.0 / (1.0 + 0.2316419 * np.abs(d)) - ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - ret_val, ret_val) -def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, - Riskfree, Volatility): +def black_scholes( + callResult, + putResult, + stockPrice, + optionStrike, + optionYears, + Riskfree, + Volatility, +): S = stockPrice X = optionStrike T = optionYears @@ -35,9 +45,9 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, cndd1 = cnd(d1) cndd2 = cnd(d2) - expRT = np.exp(- R * T) - callResult[:] = (S * cndd1 - X * expRT * cndd2) - putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)) + expRT = np.exp(-R * T) + callResult[:] = S * cndd1 - X * expRT * cndd2 + putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1) def randfloat(rand_var, low, high): @@ -61,34 +71,54 @@ def test_blackscholes(self): # numpy for i in range(iterations): - black_scholes(callResultNumpy, putResultNumpy, stockPrice, - optionStrike, optionYears, RISKFREE, VOLATILITY) + black_scholes( + callResultNumpy, + putResultNumpy, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) @cuda.jit(double(double), device=True, inline=True) def cnd_cuda(d): K = 1.0 / (1.0 + 0.2316419 * 
math.fabs(d)) - ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * math.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d > 0: ret_val = 1.0 - ret_val return ret_val - @cuda.jit(void(double[:], double[:], double[:], double[:], double[:], - double, double)) + @cuda.jit( + void( + double[:], + double[:], + double[:], + double[:], + double[:], + double, + double, + ) + ) def black_scholes_cuda(callResult, putResult, S, X, T, R, V): i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) - d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) - / (V * sqrtT)) + d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / ( + V * sqrtT + ) d2 = d1 - V * sqrtT cndd1 = cnd_cuda(d1) cndd2 = cnd_cuda(d2) - expRT = math.exp((-1. * R) * T[i]) - callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) - putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) + expRT = math.exp((-1.0 * R) * T[i]) + callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 + putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) # numba blockdim = 512, 1 @@ -102,8 +132,14 @@ def black_scholes_cuda(callResult, putResult, S, X, T, R, V): for i in range(iterations): black_scholes_cuda[griddim, blockdim, stream]( - d_callResult, d_putResult, d_stockPrice, d_optionStrike, - d_optionYears, RISKFREE, VOLATILITY) + d_callResult, + d_putResult, + d_stockPrice, + d_optionStrike, + d_optionYears, + RISKFREE, + VOLATILITY, + ) d_callResult.copy_to_host(callResultNumba, stream) d_putResult.copy_to_host(putResultNumba, stream) stream.synchronize() @@ -116,5 +152,5 @@ def black_scholes_cuda(callResult, putResult, S, X, T, R, V): self.assertTrue(max_abs_err < 1e-13) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py 
b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py index fc0568233..ac0d28769 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py @@ -12,13 +12,13 @@ def boolean_func(A, vertial): class TestCudaBoolean(CUDATestCase): def test_boolean(self): - func = cuda.jit('void(float64[:], bool_)')(boolean_func) - A = np.array([0], dtype='float64') + func = cuda.jit("void(float64[:], bool_)")(boolean_func) + A = np.array([0], dtype="float64") func[1, 1](A, True) self.assertTrue(A[0] == 123) func[1, 1](A, False) self.assertTrue(A[0] == 321) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_caching.py b/numba_cuda/numba/cuda/tests/cudapy/test_caching.py index 22e2f4a6e..d8002207e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_caching.py @@ -8,15 +8,22 @@ from numba import cuda from numba.core.errors import NumbaWarning -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_60, skip_if_cudadevrt_missing, - skip_if_mvc_enabled, test_data_dir) +from numba.cuda.testing import ( + CUDATestCase, + skip_on_cudasim, + skip_unless_cc_60, + skip_if_cudadevrt_missing, + skip_if_mvc_enabled, + test_data_dir, +) from numba.tests.support import SerialMixin -from numba.tests.test_caching import (DispatcherCacheUsecasesTest, - skip_bad_access) +from numba.tests.test_caching import ( + DispatcherCacheUsecasesTest, + skip_bad_access, +) -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) usecases_file = os.path.join(here, "cache_usecases.py") @@ -72,23 +79,23 @@ def test_many_locals(self): mod = self.import_module() f = mod.many_locals f[1, 1]() - self.check_pycache(2) # 1 index, 1 data + 
self.check_pycache(2) # 1 index, 1 data def test_closure(self): mod = self.import_module() with warnings.catch_warnings(): - warnings.simplefilter('error', NumbaWarning) + warnings.simplefilter("error", NumbaWarning) f = mod.closure1 - self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6 + self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6 f = mod.closure2 - self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8 + self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8 f = mod.closure3 - self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10 + self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10 f = mod.closure4 - self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12 - self.check_pycache(5) # 1 nbi, 4 nbc + self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12 + self.check_pycache(5) # 1 nbi, 4 nbc def test_cache_reuse(self): mod = self.import_module() @@ -158,7 +165,7 @@ def test_same_names(self): @skip_unless_cc_60 @skip_if_cudadevrt_missing - @skip_if_mvc_enabled('CG not supported with MVC') + @skip_if_mvc_enabled("CG not supported with MVC") def test_cache_cg(self): # Functions using cooperative groups should be cacheable. 
See Issue # #8888: https://github.com/numba/numba/issues/8888 @@ -174,7 +181,7 @@ def test_cache_cg(self): @skip_unless_cc_60 @skip_if_cudadevrt_missing - @skip_if_mvc_enabled('CG not supported with MVC') + @skip_if_mvc_enabled("CG not supported with MVC") def test_cache_cg_clean_run(self): # See Issue #9432: https://github.com/numba/numba/issues/9432 # If a cached function using CG sync was the first thing to compile, @@ -191,9 +198,11 @@ def test_cache_cg_clean_run(self): mod.cg_usecase(0) """ % dict(tempdir=self.tempdir, modname=self.modname) - popen = subprocess.Popen([sys.executable, "-c", code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + popen = subprocess.Popen( + [sys.executable, "-c", code], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, err = popen.communicate(timeout=60) if popen.returncode != 0: raise AssertionError( @@ -212,8 +221,9 @@ def _test_pycache_fallback(self): f = mod.add_usecase # Remove this function's cache files at the end, to avoid accumulation # across test calls. 
- self.addCleanup(shutil.rmtree, f.func.stats.cache_path, - ignore_errors=True) + self.addCleanup( + shutil.rmtree, f.func.stats.cache_path, ignore_errors=True + ) self.assertPreciseEqual(f(2, 3), 6) # It's a cache miss since the file was copied to a new temp location @@ -230,8 +240,9 @@ def _test_pycache_fallback(self): self.check_pycache(0) @skip_bad_access - @unittest.skipIf(os.name == "nt", - "cannot easily make a directory read-only on Windows") + @unittest.skipIf( + os.name == "nt", "cannot easily make a directory read-only on Windows" + ) def test_non_creatable_pycache(self): # Make it impossible to create the __pycache__ directory old_perms = os.stat(self.tempdir).st_mode @@ -241,11 +252,12 @@ def test_non_creatable_pycache(self): self._test_pycache_fallback() @skip_bad_access - @unittest.skipIf(os.name == "nt", - "cannot easily make a directory read-only on Windows") + @unittest.skipIf( + os.name == "nt", "cannot easily make a directory read-only on Windows" + ) def test_non_writable_pycache(self): # Make it impossible to write to the __pycache__ directory - pycache = os.path.join(self.tempdir, '__pycache__') + pycache = os.path.join(self.tempdir, "__pycache__") os.mkdir(pycache) old_perms = os.stat(pycache).st_mode os.chmod(pycache, 0o500) @@ -254,15 +266,16 @@ def test_non_writable_pycache(self): self._test_pycache_fallback() def test_cannot_cache_linking_libraries(self): - link = str(test_data_dir / 'jitlink.ptx') - msg = 'Cannot pickle CUDACodeLibrary with linking files' + link = str(test_data_dir / "jitlink.ptx") + msg = "Cannot pickle CUDACodeLibrary with linking files" with self.assertRaisesRegex(RuntimeError, msg): - @cuda.jit('void()', cache=True, link=[link]) + + @cuda.jit("void()", cache=True, link=[link]) def f(): pass -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) 
usecases_file = os.path.join(here, "cache_with_cpu_usecases.py") @@ -353,7 +366,7 @@ def get_different_cc_gpus(): return None -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) usecases_file = os.path.join(here, "cache_usecases.py") @@ -370,7 +383,7 @@ def tearDown(self): def test_cache(self): gpus = get_different_cc_gpus() if not gpus: - self.skipTest('Need two different CCs for multi-CC cache test') + self.skipTest("Need two different CCs for multi-CC cache test") self.check_pycache(0) mod = self.import_module() @@ -482,13 +495,13 @@ def child_initializer(): # Disable occupancy and implicit copy warnings in processes in a # multiprocessing pool. from numba.core import config + config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 config.CUDA_WARN_ON_IMPLICIT_COPY = 0 -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest): - # Nested multiprocessing.Pool raises AssertionError: # "daemonic processes are not allowed to have children" _numba_parallel_test_ = False @@ -513,7 +526,7 @@ def test_multiprocessing(self): f = mod.simple_usecase_caller n = 3 try: - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") except AttributeError: ctx = multiprocessing @@ -526,7 +539,7 @@ def test_multiprocessing(self): self.assertEqual(res, n * (n - 1) // 2) -@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary') +@skip_on_cudasim("Simulator does not implement the CUDACodeLibrary") class TestCUDACodeLibrary(CUDATestCase): # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to # explicitly check @@ -539,7 +552,7 @@ def test_cannot_serialize_unfinalized(self): # Usually a CodeLibrary requires a real CodeGen, but since we don't # interact with 
it, anything will do codegen = object() - name = 'library' + name = "library" cl = CUDACodeLibrary(codegen, name) - with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'): + with self.assertRaisesRegex(RuntimeError, "Cannot pickle unfinalized"): cl._reduce_states() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_casting.py b/numba_cuda/numba/cuda/tests/cudapy/test_casting.py index 2ce77e05b..1d291fa9f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_casting.py @@ -4,8 +4,7 @@ from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8 from numba import cuda from numba.core import types -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_53) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53 from numba.types import float16, float32 import itertools import unittest @@ -50,7 +49,7 @@ def to_uint64(x): def to_float16(x): # When division and operators on float16 types are supported, this should # be changed to match the implementation in to_float32. - return (np.float16(x) * np.float16(0.5)) + return np.float16(x) * np.float16(0.5) def to_float32(x): @@ -76,6 +75,7 @@ def to_complex128(x): # - The device version uses cuda.fp16.hmul # - The host version uses the * operator + def cuda_int_literal_to_float16(x): # Note that we need to use `2` and not `np.float16(2)` to ensure that this # types as a literal int and not a const float16. 
@@ -128,7 +128,7 @@ def test_float_to_int(self): self.assertEqual(cfunc(-12.3), pyfunc(-12.3)) self.assertEqual(cfunc(-12.3), int(-12.3)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_int_ptx(self): pyfuncs = (to_int8, to_int16, to_int32, to_int64) sizes = (8, 16, 32, 64) @@ -150,7 +150,7 @@ def test_float_to_uint(self): self.assertEqual(cfunc(12.3), pyfunc(12.3)) self.assertEqual(cfunc(12.3), int(12.3)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_uint_ptx(self): pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64) sizes = (8, 16, 32, 64) @@ -171,17 +171,18 @@ def test_int_to_float(self): @skip_unless_cc_53 def test_literal_to_float16(self): - cudafuncs = (cuda_int_literal_to_float16, - cuda_float_literal_to_float16) - hostfuncs = (reference_int_literal_to_float16, - reference_float_literal_to_float16) + cudafuncs = (cuda_int_literal_to_float16, cuda_float_literal_to_float16) + hostfuncs = ( + reference_int_literal_to_float16, + reference_float_literal_to_float16, + ) for cudafunc, hostfunc in zip(cudafuncs, hostfuncs): with self.subTest(func=cudafunc): cfunc = self._create_wrapped(cudafunc, np.float16, np.float16) self.assertEqual(cfunc(321), hostfunc(321)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_int_to_float16_ptx(self): fromtys = (i1, i2, i4, i8) sizes = (8, 16, 32, 64) @@ -190,7 +191,7 @@ def test_int_to_float16_ptx(self): ptx, _ = compile_ptx(to_float16, (ty,), device=True) self.assertIn(f"cvt.rn.f16.s{size}", ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_uint_to_float16_ptx(self): fromtys = (u1, u2, u4, u8) sizes = (8, 16, 32, 64) @@ -211,12 +212,14 @@ def 
test_float_to_float(self): # the CUDA target doesn't yet implement division (or operators) # for float16 values, so we test by comparing with the computed # expression instead. - np.testing.assert_allclose(cfunc(12.3), - toty(12.3) / toty(2), rtol=0.0003) - np.testing.assert_allclose(cfunc(-12.3), - toty(-12.3) / toty(2), rtol=0.0003) - - @skip_on_cudasim('Compilation unsupported in the simulator') + np.testing.assert_allclose( + cfunc(12.3), toty(12.3) / toty(2), rtol=0.0003 + ) + np.testing.assert_allclose( + cfunc(-12.3), toty(-12.3) / toty(2), rtol=0.0003 + ) + + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_float_ptx(self): pyfuncs = (to_float32, to_float64) postfixes = ("f32", "f64") @@ -239,12 +242,14 @@ def test_float_to_complex(self): # to match the casting that is automatically applied when # passing the input to the cfunc as part of wrapping it in # an array of type fromtype. - np.testing.assert_allclose(cfunc(3.21), - pyfunc(fromty(3.21))) - np.testing.assert_allclose(cfunc(-3.21), - pyfunc(fromty(-3.21)) + 0j) - - @skip_on_cudasim('Compilation unsupported in the simulator') + np.testing.assert_allclose( + cfunc(3.21), pyfunc(fromty(3.21)) + ) + np.testing.assert_allclose( + cfunc(-3.21), pyfunc(fromty(-3.21)) + 0j + ) + + @skip_on_cudasim("Compilation unsupported in the simulator") def test_native_cast(self): float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True) self.assertIn("st.f32", float32_ptx) @@ -253,5 +258,5 @@ def test_native_cast(self): self.assertIn("st.u16", float16_ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py b/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py index ee09fcc31..57f3efdb8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py @@ -1,21 +1,26 @@ import numpy as np from numba import cuda, types -from numba.cuda.testing import 
(skip_on_cudasim, test_data_dir, unittest, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + test_data_dir, + unittest, + CUDATestCase, +) from numba.tests.support import skip_unless_cffi @skip_unless_cffi -@skip_on_cudasim('Simulator does not support linking') +@skip_on_cudasim("Simulator does not support linking") class TestCFFI(CUDATestCase): def test_from_buffer(self): import cffi + ffi = cffi.FFI() - link = str(test_data_dir / 'jitlink.ptx') + link = str(test_data_dir / "jitlink.ptx") sig = types.void(types.CPointer(types.int32)) - array_mutator = cuda.declare_device('array_mutator', sig) + array_mutator = cuda.declare_device("array_mutator", sig) @cuda.jit(link=[link]) def mutate_array(x): @@ -29,5 +34,5 @@ def mutate_array(x): self.assertEqual(x[0], x[1]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py index 3a9ded7c4..ddc847681 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py @@ -1,7 +1,11 @@ from math import sqrt from numba import cuda, float32, int16, int32, int64, uint32, void -from numba.cuda import (compile, compile_for_current_device, compile_ptx, - compile_ptx_for_current_device) +from numba.cuda import ( + compile, + compile_for_current_device, + compile_ptx, + compile_ptx_for_current_device, +) from numba.cuda.cudadrv import runtime from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase @@ -12,7 +16,7 @@ def f_module(x, y): return x + y -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompile(unittest.TestCase): def test_global_kernel(self): def f(r, x, y): @@ -24,11 +28,11 @@ def f(r, x, y): ptx, resty = compile_ptx(f, args) # Kernels should not have a func_retval parameter - self.assertNotIn('func_retval', ptx) + 
self.assertNotIn("func_retval", ptx) # .visible .func is used to denote a device function - self.assertNotIn('.visible .func', ptx) + self.assertNotIn(".visible .func", ptx) # .visible .entry would denote the presence of a global function - self.assertIn('.visible .entry', ptx) + self.assertIn(".visible .entry", ptx) # Return type for kernels should always be void self.assertEqual(resty, void) @@ -41,11 +45,11 @@ def add(x, y): # Device functions take a func_retval parameter for storing the # returned value in by reference - self.assertIn('func_retval', ptx) + self.assertIn("func_retval", ptx) # .visible .func is used to denote a device function - self.assertIn('.visible .func', ptx) + self.assertIn(".visible .func", ptx) # .visible .entry would denote the presence of a global function - self.assertNotIn('.visible .entry', ptx) + self.assertNotIn(".visible .entry", ptx) # Inferred return type as expected? self.assertEqual(resty, float32) @@ -71,21 +75,21 @@ def f(x, y, z, d): # Without fastmath, fma contraction is enabled by default, but ftz and # approximate div / sqrt is not. - self.assertIn('fma.rn.f32', ptx) - self.assertIn('div.rn.f32', ptx) - self.assertIn('sqrt.rn.f32', ptx) + self.assertIn("fma.rn.f32", ptx) + self.assertIn("div.rn.f32", ptx) + self.assertIn("sqrt.rn.f32", ptx) ptx, resty = compile_ptx(f, args, device=True, fastmath=True) # With fastmath, ftz and approximate div / sqrt are enabled - self.assertIn('fma.rn.ftz.f32', ptx) - self.assertIn('div.approx.ftz.f32', ptx) - self.assertIn('sqrt.approx.ftz.f32', ptx) + self.assertIn("fma.rn.ftz.f32", ptx) + self.assertIn("div.approx.ftz.f32", ptx) + self.assertIn("sqrt.approx.ftz.f32", ptx) def check_debug_info(self, ptx): # A debug_info section should exist in the PTX. Whitespace varies # between CUDA toolkit versions. - self.assertRegex(ptx, '\\.section\\s+\\.debug_info') + self.assertRegex(ptx, "\\.section\\s+\\.debug_info") # A .file directive should be produced and include the name of the # source. 
The path and whitespace may vary, so we accept anything # ending in the filename of this module. @@ -136,23 +140,25 @@ def test_non_void_return_type(self): def f(x, y): return x[0] + y[0] - with self.assertRaisesRegex(TypeError, 'must have void return type'): + with self.assertRaisesRegex(TypeError, "must have void return type"): compile_ptx(f, (uint32[::1], uint32[::1])) def test_c_abi_disallowed_for_kernel(self): def f(x, y): return x + y - with self.assertRaisesRegex(NotImplementedError, - "The C ABI is not supported for kernels"): + with self.assertRaisesRegex( + NotImplementedError, "The C ABI is not supported for kernels" + ): compile_ptx(f, (int32, int32), abi="c") def test_unsupported_abi(self): def f(x, y): return x + y - with self.assertRaisesRegex(NotImplementedError, - "Unsupported ABI: fastcall"): + with self.assertRaisesRegex( + NotImplementedError, "Unsupported ABI: fastcall" + ): compile_ptx(f, (int32, int32), abi="fastcall") def test_c_abi_device_function(self): @@ -166,8 +172,11 @@ def f(x, y): # The function name should match the Python function name (not the # qualname, which includes additional info), and its return value # should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f\(", + ) # If we compile for 64-bit integers, the return type should be 64 bits # wide @@ -175,44 +184,60 @@ def f(x, y): self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64") def test_c_abi_device_function_module_scope(self): - ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True, - abi="c") + ptx, resty = compile_ptx( + f_module, int32(int32, int32), device=True, abi="c" + ) # The function name should match the Python function name, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f_module\(") + 
self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f_module\(", + ) def test_c_abi_with_abi_name(self): - abi_info = {'abi_name': '_Z4funcii'} - ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True, - abi="c", abi_info=abi_info) + abi_info = {"abi_name": "_Z4funcii"} + ptx, resty = compile_ptx( + f_module, + int32(int32, int32), + device=True, + abi="c", + abi_info=abi_info, + ) # The function name should match the one given in the ABI info, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+_Z4funcii\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+_Z4funcii\(", + ) def test_compile_defaults_to_c_abi(self): ptx, resty = compile(f_module, int32(int32, int32), device=True) # The function name should match the Python function name, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f_module\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f_module\(", + ) def test_compile_to_ltoir(self): if runtime.get_version() < (11, 5): self.skipTest("-gen-lto unavailable in this toolkit version") - ltoir, resty = compile(f_module, int32(int32, int32), device=True, - output="ltoir") + ltoir, resty = compile( + f_module, int32(int32, int32), device=True, output="ltoir" + ) # There are no tools to interpret the LTOIR output, but we can check # that we appear to have obtained an LTOIR file. This magic number is # not documented, but is expected to remain consistent. 
LTOIR_MAGIC = 0x7F4E43ED - header = int.from_bytes(ltoir[:4], byteorder='little') + header = int.from_bytes(ltoir[:4], byteorder="little") self.assertEqual(header, LTOIR_MAGIC) self.assertEqual(resty, int32) @@ -220,11 +245,15 @@ def test_compile_to_invalid_error(self): illegal_output = "illegal" msg = f"Unsupported output type: {illegal_output}" with self.assertRaisesRegex(NotImplementedError, msg): - compile(f_module, int32(int32, int32), device=True, - output=illegal_output) + compile( + f_module, + int32(int32, int32), + device=True, + output=illegal_output, + ) -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompileForCurrentDevice(CUDATestCase): def _check_ptx_for_current_device(self, compile_function): def add(x, y): @@ -237,7 +266,7 @@ def add(x, y): # closest compute capability supported by the current toolkit. device_cc = cuda.get_current_device().compute_capability cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc) - target = f'.target sm_{cc[0]}{cc[1]}' + target = f".target sm_{cc[0]}{cc[1]}" self.assertIn(target, ptx) def test_compile_ptx_for_current_device(self): @@ -247,10 +276,10 @@ def test_compile_for_current_device(self): self._check_ptx_for_current_device(compile_for_current_device) -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompileOnlyTests(unittest.TestCase): - '''For tests where we can only check correctness by examining the compiler - output rather than observing the effects of execution.''' + """For tests where we can only check correctness by examining the compiler + output rather than observing the effects of execution.""" def test_nanosleep(self): def use_nanosleep(x): @@ -262,15 +291,20 @@ def use_nanosleep(x): ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0)) nanosleep_count = 0 - for line in ptx.split('\n'): - if 'nanosleep.u32' in line: + for line 
in ptx.split("\n"): + if "nanosleep.u32" in line: nanosleep_count += 1 expected = 2 - self.assertEqual(expected, nanosleep_count, - (f'Got {nanosleep_count} nanosleep instructions, ' - f'expected {expected}')) + self.assertEqual( + expected, + nanosleep_count, + ( + f"Got {nanosleep_count} nanosleep instructions, " + f"expected {expected}" + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py index 958393162..d956433f2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py @@ -6,20 +6,34 @@ from numba.cuda.testing import unittest, CUDATestCase from numba.core import types from numba import cuda -from numba.tests.complex_usecases import (real_usecase, imag_usecase, - conjugate_usecase, phase_usecase, - polar_as_complex_usecase, - rect_usecase, isnan_usecase, - isinf_usecase, isfinite_usecase, - exp_usecase, log_usecase, - log_base_usecase, log10_usecase, - sqrt_usecase, asin_usecase, - acos_usecase, atan_usecase, - cos_usecase, sin_usecase, - tan_usecase, acosh_usecase, - asinh_usecase, atanh_usecase, - cosh_usecase, sinh_usecase, - tanh_usecase) +from numba.tests.complex_usecases import ( + real_usecase, + imag_usecase, + conjugate_usecase, + phase_usecase, + polar_as_complex_usecase, + rect_usecase, + isnan_usecase, + isinf_usecase, + isfinite_usecase, + exp_usecase, + log_usecase, + log_base_usecase, + log10_usecase, + sqrt_usecase, + asin_usecase, + acos_usecase, + atan_usecase, + cos_usecase, + sin_usecase, + tan_usecase, + acosh_usecase, + asinh_usecase, + atanh_usecase, + cosh_usecase, + sinh_usecase, + tanh_usecase, +) from numba.np import numpy_support @@ -29,15 +43,18 @@ def compile_scalar_func(pyfunc, argtypes, restype): assert not isinstance(restype, types.Array) device_func = cuda.jit(restype(*argtypes), device=True)(pyfunc) - kernel_types = 
[types.Array(tp, 1, "C") - for tp in [restype] + list(argtypes)] + kernel_types = [ + types.Array(tp, 1, "C") for tp in [restype] + list(argtypes) + ] if len(argtypes) == 1: + def kernel_func(out, a): i = cuda.grid(1) if i < out.shape[0]: out[i] = device_func(a[i]) elif len(argtypes) == 2: + def kernel_func(out, a, b): i = cuda.grid(1) if i < out.shape[0]: @@ -49,8 +66,9 @@ def kernel_func(out, a, b): def kernel_wrapper(values): n = len(values) - inputs = [np.empty(n, dtype=numpy_support.as_dtype(tp)) - for tp in argtypes] + inputs = [ + np.empty(n, dtype=numpy_support.as_dtype(tp)) for tp in argtypes + ] output = np.empty(n, dtype=numpy_support.as_dtype(restype)) for i, vs in enumerate(values): for v, inp in zip(vs, inputs): @@ -58,42 +76,70 @@ def kernel_wrapper(values): args = [output] + inputs kernel[int(math.ceil(n / 256)), 256](*args) return list(output) + return kernel_wrapper class BaseComplexTest(CUDATestCase): - def basic_values(self): - reals = [-0.0, +0.0, 1, -1, +1.5, -3.5, - float('-inf'), float('+inf'), float('nan')] + reals = [ + -0.0, + +0.0, + 1, + -1, + +1.5, + -3.5, + float("-inf"), + float("+inf"), + float("nan"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def more_values(self): - reals = [0.0, +0.0, 1, -1, -math.pi, +math.pi, - float('-inf'), float('+inf'), float('nan')] + reals = [ + 0.0, + +0.0, + 1, + -1, + -math.pi, + +math.pi, + float("-inf"), + float("+inf"), + float("nan"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def non_nan_values(self): - reals = [-0.0, +0.0, 1, -1, -math.pi, +math.pi, - float('inf'), float('-inf')] + reals = [ + -0.0, + +0.0, + 1, + -1, + -math.pi, + +math.pi, + float("inf"), + float("-inf"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def run_func(self, pyfunc, sigs, values, ulps=1, ignore_sign_on_zero=False): for sig in sigs: if isinstance(sig, types.Type): - sig = sig, + sig = (sig,) if isinstance(sig, tuple): # Assume return 
type is the type of first argument sig = sig[0](*sig) - prec = ('single' - if sig.args[0] in (types.float32, types.complex64) - else 'double') + prec = ( + "single" + if sig.args[0] in (types.float32, types.complex64) + else "double" + ) cudafunc = compile_scalar_func(pyfunc, sig.args, sig.return_type) ok_values = [] expected_list = [] for args in values: if not isinstance(args, (list, tuple)): - args = args, + args = (args,) try: expected_list.append(pyfunc(*args)) ok_values.append(args) @@ -102,24 +148,31 @@ def run_func(self, pyfunc, sigs, values, ulps=1, ignore_sign_on_zero=False): continue got_list = cudafunc(ok_values) for got, expected, args in zip(got_list, expected_list, ok_values): - msg = 'for input %r with prec %r' % (args, prec) - self.assertPreciseEqual(got, expected, prec=prec, - ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero, - msg=msg) + msg = "for input %r with prec %r" % (args, prec) + self.assertPreciseEqual( + got, + expected, + prec=prec, + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + msg=msg, + ) run_unary = run_func run_binary = run_func class TestComplex(BaseComplexTest): - def check_real_image(self, pyfunc): values = self.basic_values() - self.run_unary(pyfunc, - [tp.underlying_float(tp) - for tp in (types.complex64, types.complex128)], - values) + self.run_unary( + pyfunc, + [ + tp.underlying_float(tp) + for tp in (types.complex64, types.complex128) + ], + values, + ) def test_real(self): self.check_real_image(real_usecase) @@ -130,9 +183,7 @@ def test_imag(self): def test_conjugate(self): pyfunc = conjugate_usecase values = self.basic_values() - self.run_unary(pyfunc, - [types.complex64, types.complex128], - values) + self.run_unary(pyfunc, [types.complex64, types.complex128], values) class TestCMath(BaseComplexTest): @@ -141,26 +192,44 @@ class TestCMath(BaseComplexTest): """ def check_predicate_func(self, pyfunc): - self.run_unary(pyfunc, - [types.boolean(tp) - for tp in (types.complex128, types.complex64)], - 
self.basic_values()) - - def check_unary_func(self, pyfunc, ulps=1, values=None, - returns_float=False, ignore_sign_on_zero=False): + self.run_unary( + pyfunc, + [types.boolean(tp) for tp in (types.complex128, types.complex64)], + self.basic_values(), + ) + + def check_unary_func( + self, + pyfunc, + ulps=1, + values=None, + returns_float=False, + ignore_sign_on_zero=False, + ): if returns_float: + def sig(tp): return tp.underlying_float(tp) else: + def sig(tp): return tp(tp) - self.run_unary(pyfunc, [sig(types.complex128)], - values or self.more_values(), ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero) + + self.run_unary( + pyfunc, + [sig(types.complex128)], + values or self.more_values(), + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + ) # Avoid discontinuities around pi when in single precision. - self.run_unary(pyfunc, [sig(types.complex64)], - values or self.basic_values(), ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero) + self.run_unary( + pyfunc, + [sig(types.complex64)], + values or self.basic_values(), + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + ) # Conversions @@ -172,11 +241,14 @@ def test_polar(self): def test_rect(self): def do_test(tp, seed_values): - values = [(z.real, z.imag) for z in seed_values - if not math.isinf(z.imag) or z.real == 0] + values = [ + (z.real, z.imag) + for z in seed_values + if not math.isinf(z.imag) or z.real == 0 + ] float_type = tp.underlying_float - self.run_binary(rect_usecase, [tp(float_type, float_type)], - values) + self.run_binary(rect_usecase, [tp(float_type, float_type)], values) + do_test(types.complex128, self.more_values()) # Avoid discontinuities around pi when in single precision. 
do_test(types.complex64, self.basic_values()) @@ -202,10 +274,11 @@ def test_log(self): def test_log_base(self): values = list(itertools.product(self.more_values(), self.more_values())) - value_types = [(types.complex128, types.complex128), - (types.complex64, types.complex64)] - self.run_binary(log_base_usecase, value_types, values, - ulps=3) + value_types = [ + (types.complex128, types.complex128), + (types.complex64, types.complex64), + ] + self.run_binary(log_base_usecase, value_types, values, ulps=3) def test_log10(self): self.check_unary_func(log10_usecase) @@ -222,8 +295,9 @@ def test_asin(self): self.check_unary_func(asin_usecase, ulps=2) def test_atan(self): - self.check_unary_func(atan_usecase, ulps=2, - values=self.non_nan_values()) + self.check_unary_func( + atan_usecase, ulps=2, values=self.non_nan_values() + ) def test_cos(self): self.check_unary_func(cos_usecase, ulps=2) @@ -233,8 +307,7 @@ def test_sin(self): self.check_unary_func(sin_usecase, ulps=2) def test_tan(self): - self.check_unary_func(tan_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(tan_usecase, ulps=2, ignore_sign_on_zero=True) # Hyperbolic functions @@ -245,8 +318,7 @@ def test_asinh(self): self.check_unary_func(asinh_usecase, ulps=2) def test_atanh(self): - self.check_unary_func(atanh_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(atanh_usecase, ulps=2, ignore_sign_on_zero=True) def test_cosh(self): self.check_unary_func(cosh_usecase, ulps=2) @@ -255,8 +327,7 @@ def test_sinh(self): self.check_unary_func(sinh_usecase, ulps=2) def test_tanh(self): - self.check_unary_func(tanh_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True) class TestAtomicOnComplexComponents(CUDATestCase): @@ -292,5 +363,5 @@ def atomic_add_one_j(values): np.testing.assert_equal(arr1 + 1j, arr2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py b/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py index e72a6df00..8f948311b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py @@ -5,7 +5,7 @@ class TestCudaComplex(CUDATestCase): def test_cuda_complex_arg(self): - @cuda.jit('void(complex128[:], complex128)') + @cuda.jit("void(complex128[:], complex128)") def foo(a, b): i = cuda.grid(1) a[i] += b @@ -16,5 +16,5 @@ def foo(a, b): self.assertTrue(np.allclose(a, a0 + 2j)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py b/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py index 173319cb2..040d5305e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py @@ -14,14 +14,17 @@ def test_const_string(self): targetctx = cuda_target.target_context mod = targetctx.create_module("") - textstring = 'A Little Brown Fox' + textstring = "A Little Brown Fox" gv0 = targetctx.insert_const_string(mod, textstring) # Insert the same const string a second time - the first should be # reused. 
targetctx.insert_const_string(mod, textstring) - res = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + res = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) # Ensure that the const string was only inserted once self.assertEqual(len(res), 1) @@ -30,12 +33,16 @@ def test_const_string(self): # Using insert_const_string fn = ir.Function(mod, fnty, "test_insert_const_string") builder = ir.IRBuilder(fn.append_basic_block()) - res = builder.addrspacecast(gv0, ir.PointerType(ir.IntType(8)), - 'generic') + res = builder.addrspacecast( + gv0, ir.PointerType(ir.IntType(8)), "generic" + ) builder.ret(res) - matches = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + matches = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) self.assertEqual(len(matches), 1) # Using insert_string_const_addrspace @@ -44,11 +51,14 @@ def test_const_string(self): res = targetctx.insert_string_const_addrspace(builder, textstring) builder.ret(res) - matches = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + matches = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) self.assertEqual(len(matches), 1) - ptx = compile_ir(str(mod)).decode('ascii') + ptx = compile_ir(str(mod)).decode("ascii") matches = list(re.findall(r"\.const.*__conststring__", ptx)) self.assertEqual(len(matches), 1) @@ -70,8 +80,8 @@ def str_assign(arr): # Expected result, e.g.: # ['XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' ''] expected = np.zeros_like(arr) - expected[:-1] = 'XYZ' - expected[-1] = '' + expected[:-1] = "XYZ" + expected[-1] = "" np.testing.assert_equal(arr, expected) def test_assign_const_byte_string(self): @@ -88,42 +98,42 @@ def bytes_assign(arr): # Expected result, e.g.: # [b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b''] expected = 
np.zeros_like(arr) - expected[:-1] = b'XYZ' - expected[-1] = b'' + expected[:-1] = b"XYZ" + expected[-1] = b"" np.testing.assert_equal(arr, expected) def test_assign_const_string_in_record(self): @cuda.jit def f(a): - a[0]['x'] = 1 - a[0]['y'] = 'ABC' - a[1]['x'] = 2 - a[1]['y'] = 'XYZ' + a[0]["x"] = 1 + a[0]["y"] = "ABC" + a[1]["x"] = 2 + a[1]["y"] = "XYZ" - dt = np.dtype([('x', np.int32), ('y', np.dtype('()', target='cuda') + @guvectorize(["(f8, f8, f8[:])"], "(),()->()", target="cuda") def vadd(inp, val, out): out[0] = inp + val @@ -118,8 +119,8 @@ def vadd(inp, val, out): def test_array_views(self): """Views created via array interface support: - - Strided slices - - Strided slices + - Strided slices + - Strided slices """ h_arr = np.random.random(10) c_arr = cuda.to_device(h_arr) @@ -148,23 +149,22 @@ def test_array_views(self): self.assertEqual(arr[::2].strides, arr_strided.strides) self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize) self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size) - self.assertEqual(arr[::2].nbytes, - arr_strided.size * arr_strided.dtype.itemsize) + self.assertEqual( + arr[::2].nbytes, arr_strided.size * arr_strided.dtype.itemsize + ) # __setitem__ interface propagates into external array # Writes to a slice arr[:5] = np.pi np.testing.assert_array_equal( - c_arr.copy_to_host(), - np.concatenate((np.full(5, np.pi), h_arr[5:])) + c_arr.copy_to_host(), np.concatenate((np.full(5, np.pi), h_arr[5:])) ) # Writes to a slice from a view arr[:5] = arr[5:] np.testing.assert_array_equal( - c_arr.copy_to_host(), - np.concatenate((h_arr[5:], h_arr[5:])) + c_arr.copy_to_host(), np.concatenate((h_arr[5:], h_arr[5:])) ) # Writes through a view @@ -177,10 +177,7 @@ def test_array_views(self): c_arr.copy_to_host()[::2], np.full(5, np.pi), ) - np.testing.assert_array_equal( - c_arr.copy_to_host()[1::2], - h_arr[1::2] - ) + np.testing.assert_array_equal(c_arr.copy_to_host()[1::2], h_arr[1::2]) def 
test_negative_strided_issue(self): # issue #3705 @@ -188,7 +185,7 @@ def test_negative_strided_issue(self): c_arr = cuda.to_device(h_arr) def base_offset(orig, sliced): - return sliced['data'][0] - orig['data'][0] + return sliced["data"][0] - orig["data"][0] h_ai = h_arr.__array_interface__ c_ai = c_arr.__cuda_array_interface__ @@ -202,8 +199,8 @@ def base_offset(orig, sliced): base_offset(c_ai, c_ai_sliced), ) # Check shape and strides are correct - self.assertEqual(h_ai_sliced['shape'], c_ai_sliced['shape']) - self.assertEqual(h_ai_sliced['strides'], c_ai_sliced['strides']) + self.assertEqual(h_ai_sliced["shape"], c_ai_sliced["shape"]) + self.assertEqual(h_ai_sliced["strides"], c_ai_sliced["strides"]) def test_negative_strided_copy_to_host(self): # issue #3705 @@ -212,28 +209,28 @@ def test_negative_strided_copy_to_host(self): sliced = c_arr[::-1] with self.assertRaises(NotImplementedError) as raises: sliced.copy_to_host() - expected_msg = 'D->H copy not implemented for negative strides' + expected_msg = "D->H copy not implemented for negative strides" self.assertIn(expected_msg, str(raises.exception)) def test_masked_array(self): h_arr = np.random.random(10) - h_mask = np.random.randint(2, size=10, dtype='bool') + h_mask = np.random.randint(2, size=10, dtype="bool") c_arr = cuda.to_device(h_arr) c_mask = cuda.to_device(h_mask) # Manually create a masked CUDA Array Interface dictionary masked_cuda_array_interface = c_arr.__cuda_array_interface__.copy() - masked_cuda_array_interface['mask'] = c_mask + masked_cuda_array_interface["mask"] = c_mask with self.assertRaises(NotImplementedError) as raises: cuda.from_cuda_array_interface(masked_cuda_array_interface) - expected_msg = 'Masked arrays are not supported' + expected_msg = "Masked arrays are not supported" self.assertIn(expected_msg, str(raises.exception)) def test_zero_size_array(self): # for #4175 c_arr = cuda.device_array(0) - self.assertEqual(c_arr.__cuda_array_interface__['data'][0], 0) + 
self.assertEqual(c_arr.__cuda_array_interface__["data"][0], 0) @cuda.jit def add_one(arr): @@ -249,49 +246,49 @@ def test_strides(self): # for #4175 # First, test C-contiguous array c_arr = cuda.device_array((2, 3, 4)) - self.assertEqual(c_arr.__cuda_array_interface__['strides'], None) + self.assertEqual(c_arr.__cuda_array_interface__["strides"], None) # Second, test non C-contiguous array c_arr = c_arr[:, 1, :] - self.assertNotEqual(c_arr.__cuda_array_interface__['strides'], None) + self.assertNotEqual(c_arr.__cuda_array_interface__["strides"], None) def test_consuming_strides(self): hostarray = np.arange(10).reshape(2, 5) devarray = cuda.to_device(hostarray) face = devarray.__cuda_array_interface__ - self.assertIsNone(face['strides']) + self.assertIsNone(face["strides"]) got = cuda.from_cuda_array_interface(face).copy_to_host() np.testing.assert_array_equal(got, hostarray) - self.assertTrue(got.flags['C_CONTIGUOUS']) + self.assertTrue(got.flags["C_CONTIGUOUS"]) # Try non-NULL strides - face['strides'] = hostarray.strides - self.assertIsNotNone(face['strides']) + face["strides"] = hostarray.strides + self.assertIsNotNone(face["strides"]) got = cuda.from_cuda_array_interface(face).copy_to_host() np.testing.assert_array_equal(got, hostarray) - self.assertTrue(got.flags['C_CONTIGUOUS']) + self.assertTrue(got.flags["C_CONTIGUOUS"]) def test_produce_no_stream(self): c_arr = cuda.device_array(10) - self.assertIsNone(c_arr.__cuda_array_interface__['stream']) + self.assertIsNone(c_arr.__cuda_array_interface__["stream"]) mapped_arr = cuda.mapped_array(10) - self.assertIsNone(mapped_arr.__cuda_array_interface__['stream']) + self.assertIsNone(mapped_arr.__cuda_array_interface__["stream"]) @linux_only def test_produce_managed_no_stream(self): managed_arr = cuda.managed_array(10) - self.assertIsNone(managed_arr.__cuda_array_interface__['stream']) + self.assertIsNone(managed_arr.__cuda_array_interface__["stream"]) def test_produce_stream(self): s = cuda.stream() c_arr = 
cuda.device_array(10, stream=s) - cai_stream = c_arr.__cuda_array_interface__['stream'] + cai_stream = c_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) s = cuda.stream() mapped_arr = cuda.mapped_array(10, stream=s) - cai_stream = mapped_arr.__cuda_array_interface__['stream'] + cai_stream = mapped_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) @@ -299,7 +296,7 @@ def test_produce_stream(self): def test_produce_managed_stream(self): s = cuda.stream() managed_arr = cuda.managed_array(10, stream=s) - cai_stream = managed_arr.__cuda_array_interface__['stream'] + cai_stream = managed_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) @@ -327,8 +324,9 @@ def test_consume_no_sync(self): # Create a foreign array with no stream f_arr = ForeignArray(cuda.device_array(10)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was not called @@ -339,8 +337,9 @@ def test_consume_sync(self): s = cuda.stream() f_arr = ForeignArray(cuda.device_array(10, stream=s)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was called @@ -354,9 +353,10 @@ def test_consume_sync_disabled(self): # Set sync to false before testing. The test suite should generally be # run with sync enabled, but stash the old value just in case it is # not. 
- with override_config('CUDA_ARRAY_INTERFACE_SYNC', False): - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with override_config("CUDA_ARRAY_INTERFACE_SYNC", False): + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was not called @@ -370,8 +370,9 @@ def test_launch_no_sync(self): def f(x): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr) # Ensure the synchronize method of a stream was not called @@ -386,8 +387,9 @@ def test_launch_sync(self): def f(x): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr) # Ensure the synchronize method of a stream was called @@ -404,8 +406,9 @@ def test_launch_sync_two_streams(self): def f(x, y): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr1, f_arr2) # Ensure that synchronize was called twice @@ -418,13 +421,15 @@ def test_launch_sync_disabled(self): f_arr1 = ForeignArray(cuda.device_array(10, stream=s1)) f_arr2 = ForeignArray(cuda.device_array(10, stream=s2)) - with override_config('CUDA_ARRAY_INTERFACE_SYNC', False): + with override_config("CUDA_ARRAY_INTERFACE_SYNC", False): + @cuda.jit def f(x, y): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr1, f_arr2) # Ensure that synchronize was 
not called diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py index 45af1b677..99f614677 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py @@ -80,11 +80,12 @@ def outer(argin, argout): def test_jit_debug_simulator(self): # Ensure that the jit decorator accepts the debug kwarg when the # simulator is in use - see Issue #6615. - with override_config('ENABLE_CUDASIM', 1): + with override_config("ENABLE_CUDASIM", 1): + @cuda.jit(debug=True, opt=False) def f(x): pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py b/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py index 7921f9e9b..1b177ccc4 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py @@ -13,9 +13,9 @@ def foo(start, end, delta): for i in range(cuda.grid(1), delta.size, cuda.gridsize(1)): delta[i] = end[i] - start[i] - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) - delta = np.zeros_like(arr1, dtype='timedelta64[D]') + delta = np.zeros_like(arr1, dtype="timedelta64[D]") foo[1, 32](arr1, arr2, delta) @@ -27,11 +27,12 @@ def foo(dates, target, delta, matches, outdelta): for i in range(cuda.grid(1), matches.size, cuda.gridsize(1)): matches[i] = dates[i] == target outdelta[i] = dates[i] - delta - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') - target = arr1[5] # datetime + + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") + target = arr1[5] # datetime delta = arr1[6] - arr1[5] # timedelta matches = np.zeros_like(arr1, dtype=np.bool_) - outdelta = np.zeros_like(arr1, dtype='datetime64[D]') + outdelta = np.zeros_like(arr1, 
dtype="datetime64[D]") foo[1, 32](arr1, target, delta, matches, outdelta) where = matches.nonzero() @@ -39,56 +40,59 @@ def foo(dates, target, delta, matches, outdelta): self.assertEqual(list(where), [5]) self.assertPreciseEqual(outdelta, arr1 - delta) - @skip_on_cudasim('ufunc API unsupported in the simulator') + @skip_on_cudasim("ufunc API unsupported in the simulator") def test_ufunc(self): - datetime_t = from_dtype(np.dtype('datetime64[D]')) + datetime_t = from_dtype(np.dtype("datetime64[D]")) - @vectorize([(datetime_t, datetime_t)], target='cuda') + @vectorize([(datetime_t, datetime_t)], target="cuda") def timediff(start, end): return end - start - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) delta = timediff(arr1, arr2) self.assertPreciseEqual(delta, arr2 - arr1) - @skip_on_cudasim('ufunc API unsupported in the simulator') + @skip_on_cudasim("ufunc API unsupported in the simulator") def test_gufunc(self): - datetime_t = from_dtype(np.dtype('datetime64[D]')) - timedelta_t = from_dtype(np.dtype('timedelta64[D]')) - - @guvectorize([(datetime_t, datetime_t, timedelta_t[:])], '(),()->()', - target='cuda') + datetime_t = from_dtype(np.dtype("datetime64[D]")) + timedelta_t = from_dtype(np.dtype("timedelta64[D]")) + + @guvectorize( + [(datetime_t, datetime_t, timedelta_t[:])], + "(),()->()", + target="cuda", + ) def timediff(start, end, out): out[0] = end - start - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) delta = timediff(arr1, arr2) self.assertPreciseEqual(delta, arr2 - arr1) - @skip_on_cudasim('no .copy_to_host() in the simulator') + @skip_on_cudasim("no .copy_to_host() in the simulator") def test_datetime_view_as_int64(self): - arr = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr = 
np.arange("2005-02", "2006-02", dtype="datetime64[D]") darr = cuda.to_device(arr) viewed = darr.view(np.int64) self.assertPreciseEqual(arr.view(np.int64), viewed.copy_to_host()) self.assertEqual(viewed.gpu_data, darr.gpu_data) - @skip_on_cudasim('no .copy_to_host() in the simulator') + @skip_on_cudasim("no .copy_to_host() in the simulator") def test_timedelta_view_as_int64(self): - arr = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr = arr - (arr - 1) - self.assertEqual(arr.dtype, np.dtype('timedelta64[D]')) + self.assertEqual(arr.dtype, np.dtype("timedelta64[D]")) darr = cuda.to_device(arr) viewed = darr.view(np.int64) self.assertPreciseEqual(arr.view(np.int64), viewed.copy_to_host()) self.assertEqual(viewed.gpu_data, darr.gpu_data) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_debug.py b/numba_cuda/numba/cuda/tests/cudapy/test_debug.py index b88c25a18..00fb70c06 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_debug.py @@ -2,8 +2,11 @@ from numba.core.utils import PYVERSION from numba.cuda.testing import skip_on_cudasim, CUDATestCase -from numba.tests.support import (override_config, captured_stderr, - captured_stdout) +from numba.tests.support import ( + override_config, + captured_stderr, + captured_stdout, +) from numba import cuda, float64 import unittest @@ -13,9 +16,8 @@ def simple_cuda(A, B): B[i] = A[i] + 1.5 -@skip_on_cudasim('Simulator does not produce debug dumps') +@skip_on_cudasim("Simulator does not produce debug dumps") class TestDebugOutput(CUDATestCase): - def compile_simple_cuda(self): with captured_stderr() as err: with captured_stdout() as out: @@ -34,14 +36,14 @@ def assert_fails(self, *args, **kwargs): self.assertRaises(AssertionError, *args, **kwargs) def check_debug_output(self, out, enabled_dumps): - all_dumps = 
dict.fromkeys(['bytecode', 'cfg', 'ir', 'llvm', - 'assembly'], - False) + all_dumps = dict.fromkeys( + ["bytecode", "cfg", "ir", "llvm", "assembly"], False + ) for name in enabled_dumps: assert name in all_dumps all_dumps[name] = True for name, enabled in sorted(all_dumps.items()): - check_meth = getattr(self, '_check_dump_%s' % name) + check_meth = getattr(self, "_check_dump_%s" % name) if enabled: check_meth(out) else: @@ -50,50 +52,50 @@ def check_debug_output(self, out, enabled_dumps): def _check_dump_bytecode(self, out): if PYVERSION > (3, 10): # binop with arg=0 is binary add, see CPython dis.py and opcode.py - self.assertIn('BINARY_OP(arg=0', out) + self.assertIn("BINARY_OP(arg=0", out) else: - self.assertIn('BINARY_ADD', out) + self.assertIn("BINARY_ADD", out) def _check_dump_cfg(self, out): - self.assertIn('CFG dominators', out) + self.assertIn("CFG dominators", out) def _check_dump_ir(self, out): - self.assertIn('--IR DUMP: simple_cuda--', out) - self.assertIn('const(float, 1.5)', out) + self.assertIn("--IR DUMP: simple_cuda--", out) + self.assertIn("const(float, 1.5)", out) def _check_dump_llvm(self, out): - self.assertIn('--LLVM DUMP', out) + self.assertIn("--LLVM DUMP", out) self.assertIn('!"kernel", i32 1', out) def _check_dump_assembly(self, out): - self.assertIn('--ASSEMBLY simple_cuda', out) - self.assertIn('Generated by NVIDIA NVVM Compiler', out) + self.assertIn("--ASSEMBLY simple_cuda", out) + self.assertIn("Generated by NVIDIA NVVM Compiler", out) def test_dump_bytecode(self): - with override_config('DUMP_BYTECODE', True): + with override_config("DUMP_BYTECODE", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['bytecode']) + self.check_debug_output(out, ["bytecode"]) def test_dump_ir(self): - with override_config('DUMP_IR', True): + with override_config("DUMP_IR", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['ir']) + self.check_debug_output(out, ["ir"]) def test_dump_cfg(self): - with 
override_config('DUMP_CFG', True): + with override_config("DUMP_CFG", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['cfg']) + self.check_debug_output(out, ["cfg"]) def test_dump_llvm(self): - with override_config('DUMP_LLVM', True): + with override_config("DUMP_LLVM", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['llvm']) + self.check_debug_output(out, ["llvm"]) def test_dump_assembly(self): - with override_config('DUMP_ASSEMBLY', True): + with override_config("DUMP_ASSEMBLY", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['assembly']) + self.check_debug_output(out, ["assembly"]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py b/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py index 0afa99115..76d732474 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py @@ -1,4 +1,4 @@ -from numba.tests.support import (override_config, captured_stdout) +from numba.tests.support import override_config, captured_stdout from numba.cuda.testing import skip_on_cudasim from numba import cuda from numba.core import types @@ -8,7 +8,7 @@ import unittest -@skip_on_cudasim('Simulator does not produce debug dumps') +@skip_on_cudasim("Simulator does not produce debug dumps") class TestCudaDebugInfo(CUDATestCase): """ These tests only checks the compiled PTX for debuginfo section @@ -49,7 +49,7 @@ def foo(x): self._check(foo, sig=(types.int32[:],), expect=True) def test_environment_override(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): # Using default value @cuda.jit(opt=False) def foo(x): @@ -86,7 +86,7 @@ def f(cond): llvm_ir = f.inspect_llvm(sig) # A varible name starting with "bool" in the debug metadata - pat = r'!DILocalVariable\(.*name:\s+\"bool' + pat = 
r"!DILocalVariable\(.*name:\s+\"bool" match = re.compile(pat).search(llvm_ir) self.assertIsNone(match, msg=llvm_ir) @@ -106,7 +106,7 @@ def f(x, y): mdnode_id = match.group(1) # verify the DIBasicType has correct encoding attribute DW_ATE_boolean - pat = rf'!{mdnode_id}\s+=\s+!DIBasicType\(.*DW_ATE_boolean' + pat = rf"!{mdnode_id}\s+=\s+!DIBasicType\(.*DW_ATE_boolean" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) @@ -133,14 +133,17 @@ def f(x): llvm_ir = f.inspect_llvm(sig) - defines = [line for line in llvm_ir.splitlines() - if 'define void @"_ZN6cudapy' in line] + defines = [ + line + for line in llvm_ir.splitlines() + if 'define void @"_ZN6cudapy' in line + ] # Make sure we only found one definition self.assertEqual(len(defines), 1) wrapper_define = defines[0] - self.assertIn('!dbg', wrapper_define) + self.assertIn("!dbg", wrapper_define) def test_debug_function_calls_internal_impl(self): # Calling a function in a module generated from an implementation @@ -198,16 +201,16 @@ def test_chained_device_function(self): debug_opts = itertools.product(*[(True, False)] * 3) for kernel_debug, f1_debug, f2_debug in debug_opts: - with self.subTest(kernel_debug=kernel_debug, - f1_debug=f1_debug, - f2_debug=f2_debug): - self._test_chained_device_function(kernel_debug, - f1_debug, - f2_debug) - - def _test_chained_device_function_two_calls(self, kernel_debug, f1_debug, - f2_debug): - + with self.subTest( + kernel_debug=kernel_debug, f1_debug=f1_debug, f2_debug=f2_debug + ): + self._test_chained_device_function( + kernel_debug, f1_debug, f2_debug + ) + + def _test_chained_device_function_two_calls( + self, kernel_debug, f1_debug, f2_debug + ): @cuda.jit(device=True, debug=f2_debug, opt=False) def f2(x): return x + 1 @@ -232,12 +235,12 @@ def test_chained_device_function_two_calls(self): debug_opts = itertools.product(*[(True, False)] * 3) for kernel_debug, f1_debug, f2_debug in debug_opts: - with self.subTest(kernel_debug=kernel_debug, - 
f1_debug=f1_debug, - f2_debug=f2_debug): - self._test_chained_device_function_two_calls(kernel_debug, - f1_debug, - f2_debug) + with self.subTest( + kernel_debug=kernel_debug, f1_debug=f1_debug, f2_debug=f2_debug + ): + self._test_chained_device_function_two_calls( + kernel_debug, f1_debug, f2_debug + ) def test_chained_device_three_functions(self): # Like test_chained_device_function, but with enough functions (three) @@ -278,13 +281,13 @@ def f(x, y): llvm_ir = f.inspect_llvm(sig) # extract the metadata node id from `types` field of DISubroutineType - pat = r'!DISubroutineType\(types:\s+!(\d+)\)' + pat = r"!DISubroutineType\(types:\s+!(\d+)\)" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) mdnode_id = match.group(1) # extract the metadata node ids from the flexible node of types - pat = rf'!{mdnode_id}\s+=\s+!{{\s+!(\d+),\s+!(\d+)\s+}}' + pat = rf"!{mdnode_id}\s+=\s+!{{\s+!(\d+),\s+!(\d+)\s+}}" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) mdnode_id1 = match.group(1) @@ -303,10 +306,10 @@ def test_kernel_args_types(self): def test_kernel_args_types_dump(self): # see issue#135 - with override_config('DUMP_LLVM', 1): + with override_config("DUMP_LLVM", 1): with captured_stdout(): self._test_kernel_args_types() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py index 0c7088b74..4ff973baa 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py @@ -3,8 +3,13 @@ import numpy as np -from numba.cuda.testing import (skip_if_curand_kernel_missing, skip_on_cudasim, - test_data_dir, unittest, CUDATestCase) +from numba.cuda.testing import ( + skip_if_curand_kernel_missing, + skip_on_cudasim, + test_data_dir, + unittest, + CUDATestCase, +) from numba import cuda, jit, float32, int32, types from 
numba.core.errors import TypingError from numba.tests.support import skip_unless_cffi @@ -12,9 +17,7 @@ class TestDeviceFunc(CUDATestCase): - def test_use_add2f(self): - @cuda.jit("float32(float32, float32)", device=True) def add2f(a, b): return a + b @@ -33,7 +36,6 @@ def use_add2f(ary): self.assertTrue(np.all(ary == exp), (ary, exp)) def test_indirect_add2f(self): - @cuda.jit("float32(float32, float32)", device=True) def add2f(a, b): return a + b @@ -74,12 +76,12 @@ def add(a, b): self._check_cpu_dispatcher(add) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_cpu_dispatcher_invalid(self): # Test invalid usage # Explicit signature disables compilation, which also disable # compiling on CUDA. - @jit('(i4, i4)') + @jit("(i4, i4)") def add(a, b): return a + b @@ -95,7 +97,7 @@ def test_cpu_dispatcher_other_module(self): def add(a, b): return a + b - mymod = ModuleType(name='mymod') + mymod = ModuleType(name="mymod") mymod.add = add del add @@ -109,7 +111,7 @@ def add_kernel(ary): add_kernel[1, ary.size](ary) np.testing.assert_equal(expect, ary) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_llvm(self): @cuda.jit(device=True) def foo(x, y): @@ -120,13 +122,13 @@ def foo(x, y): fname = cres.fndesc.mangled_name # Verify that the function name has "foo" in it as in the python name - self.assertIn('foo', fname) + self.assertIn("foo", fname) llvm = foo.inspect_llvm(args) # Check that the compiled function name is in the LLVM. 
self.assertIn(fname, llvm) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_asm(self): @cuda.jit(device=True) def foo(x, y): @@ -137,13 +139,13 @@ def foo(x, y): fname = cres.fndesc.mangled_name # Verify that the function name has "foo" in it as in the python name - self.assertIn('foo', fname) + self.assertIn("foo", fname) ptx = foo.inspect_asm(args) # Check that the compiled function name is in the PTX self.assertIn(fname, ptx) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_sass_disallowed(self): @cuda.jit(device=True) def foo(x, y): @@ -152,10 +154,11 @@ def foo(x, y): with self.assertRaises(RuntimeError) as raises: foo.inspect_sass((int32, int32)) - self.assertIn('Cannot inspect SASS of a device function', - str(raises.exception)) + self.assertIn( + "Cannot inspect SASS of a device function", str(raises.exception) + ) - @skip_on_cudasim('cudasim will allow calling any function') + @skip_on_cudasim("cudasim will allow calling any function") def test_device_func_as_kernel_disallowed(self): @cuda.jit(device=True) def f(): @@ -164,10 +167,12 @@ def f(): with self.assertRaises(RuntimeError) as raises: f[1, 1]() - self.assertIn('Cannot compile a device function as a kernel', - str(raises.exception)) + self.assertIn( + "Cannot compile a device function as a kernel", + str(raises.exception), + ) - @skip_on_cudasim('cudasim ignores casting by jit decorator signature') + @skip_on_cudasim("cudasim ignores casting by jit decorator signature") def test_device_casting(self): # Ensure that casts to the correct type are forced when calling a # device function with a signature. This test ensures that: @@ -176,20 +181,23 @@ def test_device_casting(self): # shouldn't # - We insert a cast when calling rgba, as opposed to failing to type. 
- @cuda.jit('int32(int32, int32, int32, int32)', device=True) + @cuda.jit("int32(int32, int32, int32, int32)", device=True) def rgba(r, g, b, a): - return (((r & 0xFF) << 16) | - ((g & 0xFF) << 8) | - ((b & 0xFF) << 0) | - ((a & 0xFF) << 24)) + return ( + ((r & 0xFF) << 16) + | ((g & 0xFF) << 8) + | ((b & 0xFF) << 0) + | ((a & 0xFF) << 24) + ) @cuda.jit def rgba_caller(x, channels): x[0] = rgba(channels[0], channels[1], channels[2], channels[3]) x = cuda.device_array(1, dtype=np.int32) - channels = cuda.to_device(np.asarray([1.0, 2.0, 3.0, 4.0], - dtype=np.float32)) + channels = cuda.to_device( + np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32) + ) rgba_caller[1, 1](x, channels) @@ -259,32 +267,31 @@ def rgba_caller(x, channels): }""") -@skip_on_cudasim('External functions unsupported in the simulator') +@skip_on_cudasim("External functions unsupported in the simulator") class TestDeclareDevice(CUDATestCase): - def check_api(self, decl): - self.assertEqual(decl.name, 'f1') + self.assertEqual(decl.name, "f1") self.assertEqual(decl.sig.args, (float32[:],)) self.assertEqual(decl.sig.return_type, int32) def test_declare_device_signature(self): - f1 = cuda.declare_device('f1', int32(float32[:])) + f1 = cuda.declare_device("f1", int32(float32[:])) self.check_api(f1) def test_declare_device_string(self): - f1 = cuda.declare_device('f1', 'int32(float32[:])') + f1 = cuda.declare_device("f1", "int32(float32[:])") self.check_api(f1) def test_bad_declare_device_tuple(self): - with self.assertRaisesRegex(TypeError, 'Return type'): - cuda.declare_device('f1', (float32[:],)) + with self.assertRaisesRegex(TypeError, "Return type"): + cuda.declare_device("f1", (float32[:],)) def test_bad_declare_device_string(self): - with self.assertRaisesRegex(TypeError, 'Return type'): - cuda.declare_device('f1', '(float32[:],)') + with self.assertRaisesRegex(TypeError, "Return type"): + cuda.declare_device("f1", "(float32[:],)") def test_link_cu_source(self): - times2 = 
cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def kernel(r, x): @@ -301,7 +308,7 @@ def kernel(r, x): def _test_link_multiple_sources(self, link_type): link = link_type([times2_cu, times4_cu]) - times4 = cuda.declare_device('times4', 'int32(int32)', link=link) + times4 = cuda.declare_device("times4", "int32(int32)", link=link) @cuda.jit def kernel(r, x): @@ -360,7 +367,7 @@ def kernel(x, seed): np.testing.assert_equal(x[0], 323845807) def test_declared_in_called_function(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -380,7 +387,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 2) def test_declared_in_called_function_twice(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func_1(x): @@ -404,7 +411,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 2) def test_declared_in_called_function_two_calls(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -424,7 +431,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 6) def test_call_declared_function_twice(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def kernel(r, x): @@ -440,7 +447,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 6) def test_declared_in_called_function_and_parent(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -460,8 +467,8 @@ def kernel(r, x): 
np.testing.assert_equal(r, x * 4) def test_call_two_different_declared_functions(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) - times3 = cuda.declare_device('times3', 'int32(int32)', link=times3_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) + times3 = cuda.declare_device("times3", "int32(int32)", link=times3_cu) @cuda.jit def kernel(r, x): @@ -477,5 +484,5 @@ def kernel(r, x): np.testing.assert_equal(r, x * 5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py b/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py index da5257699..4bb773ef1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py @@ -15,19 +15,18 @@ def add_kernel(r, x, y): r[0] = x + y -@skip_on_cudasim('Specialization not implemented in the simulator') +@skip_on_cudasim("Specialization not implemented in the simulator") class TestDispatcherSpecialization(CUDATestCase): def _test_no_double_specialize(self, dispatcher, ty): - with self.assertRaises(RuntimeError) as e: dispatcher.specialize(ty) - self.assertIn('Dispatcher already specialized', str(e.exception)) + self.assertIn("Dispatcher already specialized", str(e.exception)) def test_no_double_specialize_sig_same_types(self): # Attempting to specialize a kernel jitted with a signature is illegal, # even for the same types the kernel is already specialized for. - @cuda.jit('void(float32[::1])') + @cuda.jit("void(float32[::1])") def f(x): pass @@ -45,7 +44,7 @@ def f(x): def test_no_double_specialize_sig_diff_types(self): # Attempting to specialize a kernel jitted with a signature is illegal. 
- @cuda.jit('void(int32[::1])') + @cuda.jit("void(int32[::1])") def f(x): pass @@ -132,13 +131,13 @@ def test_coerce_input_types(self): self.assertEqual(r[0], add(12300000000, 456)) # Now force compilation of only a single specialization - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) c_add[1, 1](r, 123, 456) self.assertPreciseEqual(r[0], add(123, 456)) - @skip_on_cudasim('Simulator ignores signature') + @skip_on_cudasim("Simulator ignores signature") @unittest.expectedFailure def test_coerce_input_types_unsafe(self): # Implicit (unsafe) conversion of float to int, originally from @@ -149,25 +148,24 @@ def test_coerce_input_types_unsafe(self): # # This test is marked as xfail until future changes enable this # behavior. - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) c_add[1, 1](r, 12.3, 45.6) self.assertPreciseEqual(r[0], add(12, 45)) - @skip_on_cudasim('Simulator ignores signature') + @skip_on_cudasim("Simulator ignores signature") def test_coerce_input_types_unsafe_complex(self): # Implicit conversion of complex to int disallowed - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) with self.assertRaises(TypeError): c_add[1, 1](r, 12.3, 45.6j) - @skip_on_cudasim('Simulator does not track overloads') + @skip_on_cudasim("Simulator does not track overloads") def test_ambiguous_new_version(self): - """Test compiling new version in an ambiguous case - """ + """Test compiling new version in an ambiguous case""" c_add = cuda.jit(add_kernel) r = np.zeros(1, dtype=np.float64) @@ -190,8 +188,9 @@ def test_ambiguous_new_version(self): # to (float, int) or (int, float) with equal weight. 
c_add[1, 1](r, 1, 1) self.assertAlmostEqual(r[0], INT + INT) - self.assertEqual(len(c_add.overloads), 4, "didn't compile a new " - "version") + self.assertEqual( + len(c_add.overloads), 4, "didn't compile a new version" + ) @skip_on_cudasim("Simulator doesn't support concurrent kernels") def test_lock(self): @@ -245,8 +244,10 @@ def _test_explicit_signatures(self, sigs): def test_explicit_signatures_strings(self): # Check with a list of strings for signatures - sigs = ["(int64[::1], int64, int64)", - "(float64[::1], float64, float64)"] + sigs = [ + "(int64[::1], int64, int64)", + "(float64[::1], float64, float64)", + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_tuples(self): @@ -256,26 +257,31 @@ def test_explicit_signatures_tuples(self): def test_explicit_signatures_signatures(self): # Check with a list of Signature objects for signatures - sigs = [void(int64[::1], int64, int64), - void(float64[::1], float64, float64)] + sigs = [ + void(int64[::1], int64, int64), + void(float64[::1], float64, float64), + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_mixed(self): # Check when we mix types of signature objects in a list of signatures # Tuple and string - sigs = [(int64[::1], int64, int64), - "(float64[::1], float64, float64)"] + sigs = [(int64[::1], int64, int64), "(float64[::1], float64, float64)"] self._test_explicit_signatures(sigs) # Tuple and Signature object - sigs = [(int64[::1], int64, int64), - void(float64[::1], float64, float64)] + sigs = [ + (int64[::1], int64, int64), + void(float64[::1], float64, float64), + ] self._test_explicit_signatures(sigs) # Signature object and string - sigs = [void(int64[::1], int64, int64), - "(float64[::1], float64, float64)"] + sigs = [ + void(int64[::1], int64, int64), + "(float64[::1], float64, float64)", + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_same_type_class(self): @@ -284,8 +290,10 @@ def test_explicit_signatures_same_type_class(self): # that 
dispatch is differentiated on the types of x and y only, to # closely preserve the intent of the original test from # numba.tests.test_dispatcher) - sigs = ["(float64[::1], float32, float32)", - "(float64[::1], float64, float64)"] + sigs = [ + "(float64[::1], float32, float32)", + "(float64[::1], float64, float64)", + ] f = cuda.jit(sigs)(add_kernel) r = np.zeros(1, dtype=np.float64) @@ -296,13 +304,17 @@ def test_explicit_signatures_same_type_class(self): f[1, 1](r, 1, 2**-25) self.assertPreciseEqual(r[0], 1.0000000298023224) - @skip_on_cudasim('No overload resolution in the simulator') + @skip_on_cudasim("No overload resolution in the simulator") def test_explicit_signatures_ambiguous_resolution(self): # Fail to resolve ambiguity between the two best overloads # (Also deliberate float64[::1] for the first argument in all cases) - f = cuda.jit(["(float64[::1], float32, float64)", - "(float64[::1], float64, float32)", - "(float64[::1], int64, int64)"])(add_kernel) + f = cuda.jit( + [ + "(float64[::1], float32, float64)", + "(float64[::1], float64, float32)", + "(float64[::1], int64, int64)", + ] + )(add_kernel) with self.assertRaises(TypeError) as cm: r = np.zeros(1, dtype=np.float64) f[1, 1](r, 1.0, 2.0) @@ -317,12 +329,12 @@ def test_explicit_signatures_ambiguous_resolution(self): r"\(Array\(float64, 1, 'C', False, aligned=True\), float32," r" float64\) -> none\n" r"\(Array\(float64, 1, 'C', False, aligned=True\), float64," - r" float32\) -> none" + r" float32\) -> none", ) # The integer signature is not part of the best matches self.assertNotIn("int64", str(cm.exception)) - @skip_on_cudasim('Simulator does not use _prepare_args') + @skip_on_cudasim("Simulator does not use _prepare_args") @unittest.expectedFailure def test_explicit_signatures_unsafe(self): # These tests are from test_explicit_signatures, but have to be xfail @@ -336,8 +348,10 @@ def test_explicit_signatures_unsafe(self): self.assertPreciseEqual(r[0], 3) self.assertEqual(len(f.overloads), 1, 
f.overloads) - sigs = ["(int64[::1], int64, int64)", - "(float64[::1], float64, float64)"] + sigs = [ + "(int64[::1], int64, int64)", + "(float64[::1], float64, float64)", + ] f = cuda.jit(sigs)(add_kernel) r = np.zeros(1, dtype=np.float64) # Approximate match (int32 -> float64 is a safe conversion) @@ -414,7 +428,7 @@ def test_explicit_signatures_device_ambiguous(self): f[1, 1](r, 1.5, 2.5) self.assertPreciseEqual(r[0], 4.0) - @skip_on_cudasim('CUDA Simulator does not force casting') + @skip_on_cudasim("CUDA Simulator does not force casting") def test_explicit_signatures_device_unsafe(self): # These tests are from test_explicit_signatures. The device function # variant of these tests can succeed on CUDA because the compilation @@ -489,17 +503,15 @@ def pi_sin_array(x, n): # provides the same values as getting the registers per thread for # individual signatures. regs_per_thread_all = pi_sin_array.get_regs_per_thread() - self.assertEqual(regs_per_thread_all[sig_f32.args], - regs_per_thread_f32) - self.assertEqual(regs_per_thread_all[sig_f64.args], - regs_per_thread_f64) + self.assertEqual(regs_per_thread_all[sig_f32.args], regs_per_thread_f32) + self.assertEqual(regs_per_thread_all[sig_f64.args], regs_per_thread_f64) if regs_per_thread_f32 == regs_per_thread_f64: # If the register usage is the same for both variants, there may be # a bug, but this may also be an artifact of the compiler / driver # / device combination, so produce an informational message only. - print('f32 and f64 variant thread usages are equal.') - print('This may warrant some investigation. Devices:') + print("f32 and f64 variant thread usages are equal.") + print("This may warrant some investigation. 
Devices:") cuda.detect() def test_get_regs_per_thread_specialized(self): @@ -696,5 +708,5 @@ def simple_lmem(ary): self.assertGreaterEqual(local_mem_per_thread, N * 4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_enums.py b/numba_cuda/numba/cuda/tests/cudapy/test_enums.py index da60b7565..6db955e06 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_enums.py @@ -12,18 +12,17 @@ Shape, Planet, RequestError, - IntEnumWithNegatives + IntEnumWithNegatives, ) class EnumTest(CUDATestCase): - pairs = [ (Color.red, Color.red), (Color.red, Color.green), (Planet.EARTH, Planet.EARTH), (Planet.VENUS, Planet.MARS), - (Shape.circle, IntEnumWithNegatives.two) # IntEnum, same value + (Shape.circle, IntEnumWithNegatives.two), # IntEnum, same value ] def test_compare(self): @@ -45,7 +44,7 @@ def test_getattr_getitem(self): def f(out): # Lookup of an enum member on its class out[0] = Color.red == Color.green - out[1] = Color['red'] == Color['green'] + out[1] = Color["red"] == Color["green"] cuda_f = cuda.jit(f) got = np.zeros((2,), dtype=np.bool_) @@ -106,16 +105,16 @@ def f(x, out): def test_vectorize(self): def f(x): if x != RequestError.not_found: - return RequestError['internal_error'] + return RequestError["internal_error"] else: return RequestError.dummy - cuda_func = vectorize("int64(int64)", target='cuda')(f) + cuda_func = vectorize("int64(int64)", target="cuda")(f) arr = np.array([2, 404, 500, 404], dtype=np.int64) expected = np.array([f(x) for x in arr], dtype=np.int64) got = cuda_func(arr) self.assertPreciseEqual(expected, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py index c20fb8dcc..0b24bee8e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +++ 
b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py @@ -17,34 +17,49 @@ def test_too_many_dims(self): with self.assertRaises(ValueError) as raises: kernfunc[(1, 2, 3, 4), (5, 6)] - self.assertIn("griddim must be a sequence of 1, 2 or 3 integers, " - "got [1, 2, 3, 4]", - str(raises.exception)) + self.assertIn( + "griddim must be a sequence of 1, 2 or 3 integers, " + "got [1, 2, 3, 4]", + str(raises.exception), + ) with self.assertRaises(ValueError) as raises: - kernfunc[(1, 2,), (3, 4, 5, 6)] - self.assertIn("blockdim must be a sequence of 1, 2 or 3 integers, " - "got [3, 4, 5, 6]", - str(raises.exception)) + kernfunc[ + ( + 1, + 2, + ), + (3, 4, 5, 6), + ] + self.assertIn( + "blockdim must be a sequence of 1, 2 or 3 integers, " + "got [3, 4, 5, 6]", + str(raises.exception), + ) def test_non_integral_dims(self): kernfunc = cuda.jit(noop) with self.assertRaises(TypeError) as raises: kernfunc[2.0, 3] - self.assertIn("griddim must be a sequence of integers, got [2.0]", - str(raises.exception)) + self.assertIn( + "griddim must be a sequence of integers, got [2.0]", + str(raises.exception), + ) with self.assertRaises(TypeError) as raises: kernfunc[2, 3.0] - self.assertIn("blockdim must be a sequence of integers, got [3.0]", - str(raises.exception)) + self.assertIn( + "blockdim must be a sequence of integers, got [3.0]", + str(raises.exception), + ) def _test_unconfigured(self, kernfunc): with self.assertRaises(ValueError) as raises: kernfunc(0) - self.assertIn("launch configuration was not specified", - str(raises.exception)) + self.assertIn( + "launch configuration was not specified", str(raises.exception) + ) def test_unconfigured_typed_cudakernel(self): kernfunc = cuda.jit("void(int32)")(noop) @@ -54,7 +69,7 @@ def test_unconfigured_untyped_cudakernel(self): kernfunc = cuda.jit(noop) self._test_unconfigured(kernfunc) - @skip_on_cudasim('TypingError does not occur on simulator') + @skip_on_cudasim("TypingError does not occur on simulator") def test_typing_error(self): 
# see #5860, this is present to catch changes to error reporting # accidentally breaking the CUDA target @@ -75,5 +90,5 @@ def kernel_func(): self.assertIn("NameError: name 'floor' is not defined", excstr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_exception.py b/numba_cuda/numba/cuda/tests/cudapy/test_exception.py index 42f31074a..63dce76eb 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_exception.py @@ -83,20 +83,19 @@ def oracle(x, y): x[i] += x[i] // y[i] n = 32 - got_x = 1. / (np.arange(n) + 0.01) - got_y = 1. / (np.arange(n) + 0.01) + got_x = 1.0 / (np.arange(n) + 0.01) + got_y = 1.0 / (np.arange(n) + 0.01) problematic[1, n](got_x, got_y) - expect_x = 1. / (np.arange(n) + 0.01) - expect_y = 1. / (np.arange(n) + 0.01) + expect_x = 1.0 / (np.arange(n) + 0.01) + expect_y = 1.0 / (np.arange(n) + 0.01) oracle[1, n](expect_x, expect_y) np.testing.assert_almost_equal(expect_x, got_x) np.testing.assert_almost_equal(expect_y, got_y) def test_raise_causing_warp_diverge(self): - """Test case for issue #2655. 
- """ + """Test case for issue #2655.""" self.case_raise_causing_warp_diverge(with_debug_mode=False) # The following two cases relate to Issue #7806: Division by zero stops the @@ -117,8 +116,8 @@ def f(r, x, y): f[1, 1](r, x, y) - self.assertTrue(np.isinf(r[0]), 'Expected inf from div by zero') - self.assertEqual(r[1], y[0], 'Expected execution to continue') + self.assertTrue(np.isinf(r[0]), "Expected inf from div by zero") + self.assertEqual(r[1], y[0], "Expected execution to continue") def test_zero_division_error_in_debug(self): # When debug is True: @@ -146,15 +145,15 @@ def f(r, x, y): with self.assertRaises(exc): f[1, 1](r, x, y) - self.assertEqual(r[0], 0, 'Expected result to be left unset') - self.assertEqual(r[1], 0, 'Expected execution to stop') + self.assertEqual(r[0], 0, "Expected result to be left unset") + self.assertEqual(r[1], 0, "Expected execution to stop") @xfail_unless_cudasim def test_raise_in_device_function(self): # This is an expected failure because reporting of exceptions raised in # device functions does not work correctly - see Issue #8036: # https://github.com/numba/numba/issues/8036 - msg = 'Device Function Error' + msg = "Device Function Error" @cuda.jit(device=True) def f(): @@ -170,5 +169,5 @@ def kernel(): self.assertIn(msg, str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py index 142d917c0..18f3ac478 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py @@ -8,12 +8,13 @@ class Interval: """ A half-open interval on the real number line. 
""" + def __init__(self, lo, hi): self.lo = lo self.hi = hi def __repr__(self): - return 'Interval(%f, %f)' % (self.lo, self.hi) + return "Interval(%f, %f)" % (self.lo, self.hi) @property def width(self): @@ -32,16 +33,21 @@ def sum_intervals(i, j): if not config.ENABLE_CUDASIM: from numba.core import cgutils - from numba.core.extending import (lower_builtin, make_attribute_wrapper, - models, register_model, type_callable, - typeof_impl) + from numba.core.extending import ( + lower_builtin, + make_attribute_wrapper, + models, + register_model, + type_callable, + typeof_impl, + ) from numba.core.typing.templates import AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry from numba.cuda.cudaimpl import lower_attr as cuda_lower_attr class IntervalType(types.Type): def __init__(self): - super().__init__(name='Interval') + super().__init__(name="Interval") interval_type = IntervalType() @@ -54,19 +60,20 @@ def type_interval(context): def typer(lo, hi): if isinstance(lo, types.Float) and isinstance(hi, types.Float): return interval_type + return typer @register_model(IntervalType) class IntervalModel(models.StructModel): def __init__(self, dmm, fe_type): members = [ - ('lo', types.float64), - ('hi', types.float64), + ("lo", types.float64), + ("hi", types.float64), ] models.StructModel.__init__(self, dmm, fe_type, members) - make_attribute_wrapper(IntervalType, 'lo', 'lo') - make_attribute_wrapper(IntervalType, 'hi', 'hi') + make_attribute_wrapper(IntervalType, "lo", "lo") + make_attribute_wrapper(IntervalType, "hi", "hi") @lower_builtin(Interval, types.Float, types.Float) def impl_interval(context, builder, sig, args): @@ -84,14 +91,14 @@ class Interval_attrs(AttributeTemplate): def resolve_width(self, mod): return types.float64 - @cuda_lower_attr(IntervalType, 'width') + @cuda_lower_attr(IntervalType, "width") def cuda_Interval_width(context, builder, sig, arg): lo = builder.extract_value(arg, 0) hi = builder.extract_value(arg, 1) return 
builder.fsub(hi, lo) -@skip_on_cudasim('Extensions not supported in the simulator') +@skip_on_cudasim("Extensions not supported in the simulator") class TestExtending(CUDATestCase): def test_attributes(self): @cuda.jit @@ -151,5 +158,5 @@ def f(r, x): np.testing.assert_allclose(r, expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py b/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py index f4b705683..de250d635 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py @@ -5,8 +5,7 @@ from math import cos, sin, tan, exp, log, log10, log2, pow, tanh from operator import truediv import numpy as np -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_75) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_75 import unittest @@ -24,10 +23,9 @@ def check(self, test: CUDATestCase, fast: str, prec: str): test.assertTrue(all(i not in prec for i in self.prec_unexpected)) -@skip_on_cudasim('Fastmath and PTX inspection not available on cudasim') +@skip_on_cudasim("Fastmath and PTX inspection not available on cudasim") class TestFastMathOption(CUDATestCase): def _test_fast_math_common(self, pyfunc, sig, device, criterion): - # Test jit code path fastver = cuda.jit(sig, device=device, fastmath=True)(pyfunc) precver = cuda.jit(sig, device=device)(pyfunc) @@ -40,9 +38,7 @@ def _test_fast_math_common(self, pyfunc, sig, device, criterion): fastptx, _ = compile_ptx_for_current_device( pyfunc, sig, device=device, fastmath=True ) - precptx, _ = compile_ptx_for_current_device( - pyfunc, sig, device=device - ) + precptx, _ = compile_ptx_for_current_device(pyfunc, sig, device=device) criterion.check(self, fastptx, precptx) @@ -69,7 +65,9 @@ def device(x, y): self._test_fast_math_common( kernel, - (float32[::1], float32, float32), device=False, criterion=criterion + 
(float32[::1], float32, float32), + device=False, + criterion=criterion, ) self._test_fast_math_common( device, (float32, float32), device=True, criterion=criterion @@ -79,39 +77,41 @@ def test_cosf(self): self._test_fast_math_unary( cos, FastMathCriterion( - fast_expected=['cos.approx.ftz.f32 '], - prec_unexpected=['cos.approx.ftz.f32 '] - ) + fast_expected=["cos.approx.ftz.f32 "], + prec_unexpected=["cos.approx.ftz.f32 "], + ), ) def test_sinf(self): self._test_fast_math_unary( sin, FastMathCriterion( - fast_expected=['sin.approx.ftz.f32 '], - prec_unexpected=['sin.approx.ftz.f32 '] - ) + fast_expected=["sin.approx.ftz.f32 "], + prec_unexpected=["sin.approx.ftz.f32 "], + ), ) def test_tanf(self): self._test_fast_math_unary( tan, - FastMathCriterion(fast_expected=[ - 'sin.approx.ftz.f32 ', - 'cos.approx.ftz.f32 ', - 'div.approx.ftz.f32 ' - ], prec_unexpected=['sin.approx.ftz.f32 ']) + FastMathCriterion( + fast_expected=[ + "sin.approx.ftz.f32 ", + "cos.approx.ftz.f32 ", + "div.approx.ftz.f32 ", + ], + prec_unexpected=["sin.approx.ftz.f32 "], + ), ) @skip_unless_cc_75 def test_tanhf(self): - self._test_fast_math_unary( tanh, FastMathCriterion( - fast_expected=['tanh.approx.f32 '], - prec_unexpected=['tanh.approx.f32 '] - ) + fast_expected=["tanh.approx.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), ) def test_tanhf_compile_ptx(self): @@ -119,74 +119,85 @@ def tanh_kernel(r, x): r[0] = tanh(x) def tanh_common_test(cc, criterion): - fastptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32), - fastmath=True, cc=cc) - precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32), - cc=cc) + fastptx, _ = compile_ptx( + tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc + ) + precptx, _ = compile_ptx( + tanh_kernel, (float32[::1], float32), cc=cc + ) criterion.check(self, fastptx, precptx) - tanh_common_test(cc=(7, 5), criterion=FastMathCriterion( - fast_expected=['tanh.approx.f32 '], - prec_unexpected=['tanh.approx.f32 '] - )) + tanh_common_test( + 
cc=(7, 5), + criterion=FastMathCriterion( + fast_expected=["tanh.approx.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), + ) - tanh_common_test(cc=(7, 0), - criterion=FastMathCriterion( - fast_expected=['ex2.approx.ftz.f32 ', - 'rcp.approx.ftz.f32 '], - prec_unexpected=['tanh.approx.f32 '])) + tanh_common_test( + cc=(7, 0), + criterion=FastMathCriterion( + fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), + ) def test_expf(self): self._test_fast_math_unary( exp, FastMathCriterion( - fast_unexpected=['fma.rn.f32 '], - prec_expected=['fma.rn.f32 '] - ) + fast_unexpected=["fma.rn.f32 "], prec_expected=["fma.rn.f32 "] + ), ) def test_logf(self): # Look for constant used to convert from log base 2 to log base e self._test_fast_math_unary( - log, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 ', '0f3F317218'], - prec_unexpected=['lg2.approx.ftz.f32 '], - ) + log, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 ", "0f3F317218"], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_log10f(self): # Look for constant used to convert from log base 2 to log base 10 self._test_fast_math_unary( - log10, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 ', '0f3E9A209B'], - prec_unexpected=['lg2.approx.ftz.f32 '] - ) + log10, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 ", "0f3E9A209B"], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_log2f(self): self._test_fast_math_unary( - log2, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 '], - prec_unexpected=['lg2.approx.ftz.f32 '] - ) + log2, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 "], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_powf(self): self._test_fast_math_binary( - pow, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 '], - prec_unexpected=['lg2.approx.ftz.f32 '], - ) + pow, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 "], + 
prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_divf(self): self._test_fast_math_binary( - truediv, FastMathCriterion( - fast_expected=['div.approx.ftz.f32 '], - fast_unexpected=['div.rn.f32'], - prec_expected=['div.rn.f32'], - prec_unexpected=['div.approx.ftz.f32 '], - ) + truediv, + FastMathCriterion( + fast_expected=["div.approx.ftz.f32 "], + fast_unexpected=["div.rn.f32"], + prec_expected=["div.rn.f32"], + prec_unexpected=["div.approx.ftz.f32 "], + ), ) def test_divf_exception(self): @@ -232,13 +243,13 @@ def bar(arr, val): # https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div # The fast version should use the "fast, approximate divide" variant - self.assertIn('div.approx.f32', fastver.inspect_asm(sig)) + self.assertIn("div.approx.f32", fastver.inspect_asm(sig)) # The precise version should use the "IEEE 754 compliant rounding" # variant, and neither of the "approximate divide" variants. - self.assertIn('div.rn.f32', precver.inspect_asm(sig)) - self.assertNotIn('div.approx.f32', precver.inspect_asm(sig)) - self.assertNotIn('div.full.f32', precver.inspect_asm(sig)) + self.assertIn("div.rn.f32", precver.inspect_asm(sig)) + self.assertNotIn("div.approx.f32", precver.inspect_asm(sig)) + self.assertNotIn("div.full.f32", precver.inspect_asm(sig)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_forall.py b/numba_cuda/numba/cuda/tests/cudapy/test_forall.py index 23286c22c..adef70911 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_forall.py @@ -44,9 +44,11 @@ def test_forall_negative_work(self): # negative element count. 
with self.assertRaises(ValueError) as raises: foo.forall(-1) - self.assertIn("Can't create ForAll with negative task count", - str(raises.exception)) + self.assertIn( + "Can't create ForAll with negative task count", + str(raises.exception), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py b/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py index 6b7b2d2ab..14470902f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py @@ -17,13 +17,15 @@ def test_freevar(self): @cuda.jit("(float32[::1], intp)") def foo(A, i): "Dummy function" - sdata = cuda.shared.array(size, # size is freevar - dtype=nbtype) # nbtype is freevar + sdata = cuda.shared.array( + size, # size is freevar + dtype=nbtype, + ) # nbtype is freevar A[i] = sdata[i] A = np.arange(2, dtype="float32") foo[1, 1](A, 0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py b/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py index 71169801e..e78971dd1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py @@ -62,5 +62,5 @@ def test_ldexp_f8(self): self.template_test_ldexp(np.float64, float64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_globals.py b/numba_cuda/numba/cuda/tests/cudapy/test_globals.py index a2406e665..0bfb277c8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_globals.py @@ -29,8 +29,7 @@ def coop_smem2d(ary): class TestCudaTestGlobal(CUDATestCase): def test_global_int_const(self): - """Test simple_smem - """ + """Test simple_smem""" compiled = cuda.jit("void(int32[:])")(simple_smem) nelem = 100 @@ -41,8 +40,7 @@ def 
test_global_int_const(self): @unittest.SkipTest def test_global_tuple_const(self): - """Test coop_smem2d - """ + """Test coop_smem2d""" compiled = cuda.jit("void(float32[:,:])")(coop_smem2d) shape = 10, 20 @@ -56,5 +54,5 @@ def test_global_tuple_const(self): self.assertTrue(np.allclose(ary, exp)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py index 098318e3a..954ed635d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py @@ -12,9 +12,11 @@ def _get_matmulcore_gufunc(dtype=float32): - @guvectorize([void(dtype[:, :], dtype[:, :], dtype[:, :])], - '(m,n),(n,p)->(m,p)', - target='cuda') + @guvectorize( + [void(dtype[:, :], dtype[:, :], dtype[:, :])], + "(m,n),(n,p)->(m,p)", + target="cuda", + ) def matmulcore(A, B, C): m, n = A.shape n, p = B.shape @@ -27,32 +29,33 @@ def matmulcore(A, B, C): return matmulcore -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAGufunc(CUDATestCase): - def test_gufunc_small(self): - gufunc = _get_matmulcore_gufunc() matrix_ct = 2 - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) C = gufunc(A, B) Gold = np.matmul(A, B) self.assertTrue(np.allclose(C, Gold)) def test_gufunc_auto_transfer(self): - gufunc = _get_matmulcore_gufunc() matrix_ct = 2 - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + 
B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) dB = cuda.to_device(B) @@ -61,24 +64,24 @@ def test_gufunc_auto_transfer(self): self.assertTrue(np.allclose(C, Gold)) def test_gufunc(self): - gufunc = _get_matmulcore_gufunc() - matrix_ct = 1001 # an odd number to test thread/block division in CUDA - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + matrix_ct = 1001 # an odd number to test thread/block division in CUDA + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) C = gufunc(A, B) Gold = np.matmul(A, B) self.assertTrue(np.allclose(C, Gold)) def test_gufunc_hidim(self): - gufunc = _get_matmulcore_gufunc() - matrix_ct = 100 # an odd number to test thread/block division in CUDA + matrix_ct = 100 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(4, 25, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(4, 25, 4, 5) @@ -87,7 +90,6 @@ def test_gufunc_hidim(self): self.assertTrue(np.allclose(C, Gold)) def test_gufunc_new_axis(self): - gufunc = _get_matmulcore_gufunc(dtype=float64) X = np.random.randn(10, 3, 3) @@ -102,15 +104,16 @@ def test_gufunc_new_axis(self): np.testing.assert_allclose(gold, res2) def test_gufunc_stream(self): - gufunc = _get_matmulcore_gufunc() - #cuda.driver.flush_pending_free() - matrix_ct = 1001 # an odd number to test thread/block division in CUDA - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + # cuda.driver.flush_pending_free() + matrix_ct = 1001 # an odd number to test thread/block division in CUDA + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = 
np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) stream = cuda.stream() dA = cuda.to_device(A, stream) @@ -126,10 +129,7 @@ def test_gufunc_stream(self): self.assertTrue(np.allclose(C, Gold)) def test_copy(self): - - @guvectorize([void(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -142,9 +142,7 @@ def copy(A, B): def test_copy_unspecified_return(self): # Ensure that behaviour is correct when the return type is not # specified in the signature. - @guvectorize([(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -155,10 +153,7 @@ def copy(A, B): self.assertTrue(np.allclose(A, B)) def test_copy_odd(self): - - @guvectorize([void(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -169,10 +164,11 @@ def copy(A, B): self.assertTrue(np.allclose(A, B)) def test_copy2d(self): - - @guvectorize([void(float32[:, :], float32[:, :])], - '(x, y)->(x, y)', - target='cuda') + @guvectorize( + [void(float32[:, :], float32[:, :])], + "(x, y)->(x, y)", + target="cuda", + ) def copy2d(A, B): for x in range(B.shape[0]): for y in range(B.shape[1]): @@ -185,8 +181,7 @@ def copy2d(A, B): def test_not_supported_call_from_jit(self): # not supported - @guvectorize([void(int32[:], int32[:])], - '(n)->(n)', target='cuda') + @guvectorize([void(int32[:], int32[:])], "(n)->(n)", target="cuda") def gufunc_copy(A, b): for i in range(A.shape[0]): b[i] = A[i] @@ -195,7 +190,7 @@ def gufunc_copy(A, b): def cuda_jit(A, b): return gufunc_copy(A, b) - A = np.arange(1024 * 32).astype('int32') + A = np.arange(1024 * 32).astype("int32") b = np.zeros_like(A) msg = "Untyped global name 
'gufunc_copy'.*" with self.assertRaisesRegex(TypingError, msg): @@ -204,56 +199,68 @@ def cuda_jit(A, b): # Test inefficient use of the GPU where the inputs are all mapped onto a # single thread in a single block. def test_inefficient_launch_configuration(self): - @guvectorize(['void(float32[:], float32[:], float32[:])'], - '(n),(n)->(n)', target='cuda') + @guvectorize( + ["void(float32[:], float32[:], float32[:])"], + "(n),(n)->(n)", + target="cuda", + ) def numba_dist_cuda(a, b, dist): len = a.shape[0] for i in range(len): dist[i] = a[i] * b[i] - a = np.random.rand(1024 * 32).astype('float32') - b = np.random.rand(1024 * 32).astype('float32') - dist = np.zeros(a.shape[0]).astype('float32') + a = np.random.rand(1024 * 32).astype("float32") + b = np.random.rand(1024 * 32).astype("float32") + dist = np.zeros(a.shape[0]).astype("float32") - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: numba_dist_cuda(a, b, dist) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Grid size', str(w[0].message)) - self.assertIn('low occupancy', str(w[0].message)) + self.assertIn("Grid size", str(w[0].message)) + self.assertIn("low occupancy", str(w[0].message)) def test_efficient_launch_configuration(self): - @guvectorize(['void(float32[:], float32[:], float32[:])'], - '(n),(n)->(n)', nopython=True, target='cuda') + @guvectorize( + ["void(float32[:], float32[:], float32[:])"], + "(n),(n)->(n)", + nopython=True, + target="cuda", + ) def numba_dist_cuda2(a, b, dist): len = a.shape[0] for i in range(len): dist[i] = a[i] * b[i] - a = np.random.rand(524288 * 2).astype('float32').\ - reshape((524288, 2)) - b = np.random.rand(524288 * 2).astype('float32').\ - reshape((524288, 2)) + a = np.random.rand(524288 * 2).astype("float32").reshape((524288, 2)) + b = np.random.rand(524288 * 2).astype("float32").reshape((524288, 2)) dist = np.zeros_like(a) - with 
override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: numba_dist_cuda2(a, b, dist) self.assertEqual(len(w), 0) def test_nopython_flag(self): - def foo(A, B): pass # nopython = True is fine - guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda', - nopython=True)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + nopython=True, + )(foo) # nopython = False is bad with self.assertRaises(TypeError) as raises: - guvectorize([void(float32[:], float32[:])], '(x)->(x)', - target='cuda', nopython=False)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + nopython=False, + )(foo) self.assertEqual("nopython flag must be True", str(raises.exception)) def test_invalid_flags(self): @@ -262,17 +269,22 @@ def foo(A, B): pass with self.assertRaises(TypeError) as raises: - guvectorize([void(float32[:], float32[:])], '(x)->(x)', - target='cuda', what1=True, ever2=False)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + what1=True, + ever2=False, + )(foo) head = "The following target options are not supported:" msg = str(raises.exception) - self.assertEqual(msg[:len(head)], head) - items = msg[len(head):].strip().split(',') + self.assertEqual(msg[: len(head)], head) + items = msg[len(head) :].strip().split(",") items = [i.strip("'\" ") for i in items] - self.assertEqual(set(['what1', 'ever2']), set(items)) + self.assertEqual(set(["what1", "ever2"]), set(items)) def test_duplicated_output(self): - @guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def foo(inp, out): pass # intentionally empty; never executed @@ -284,8 +296,9 @@ def foo(inp, out): self.assertEqual(str(raises.exception), msg) def check_tuple_arg(self, a, b): - @guvectorize([(float64[:], float64[:], 
float64[:])], '(n),(n)->()', - target='cuda') + @guvectorize( + [(float64[:], float64[:], float64[:])], "(n),(n)->()", target="cuda" + ) def gu_reduce(x, y, r): s = 0 for i in range(len(x)): @@ -297,44 +310,40 @@ def gu_reduce(x, y, r): np.testing.assert_equal(expected, r) def test_tuple_of_tuple_arg(self): - a = ((1.0, 2.0, 3.0), - (4.0, 5.0, 6.0)) - b = ((1.5, 2.5, 3.5), - (4.5, 5.5, 6.5)) + a = ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)) + b = ((1.5, 2.5, 3.5), (4.5, 5.5, 6.5)) self.check_tuple_arg(a, b) def test_tuple_of_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) - a = (Point(x=1.0, y=2.0, z=3.0), - Point(x=4.0, y=5.0, z=6.0)) - b = (Point(x=1.5, y=2.5, z=3.5), - Point(x=4.5, y=5.5, z=6.5)) + Point = namedtuple("Point", ("x", "y", "z")) + a = (Point(x=1.0, y=2.0, z=3.0), Point(x=4.0, y=5.0, z=6.0)) + b = (Point(x=1.5, y=2.5, z=3.5), Point(x=4.5, y=5.5, z=6.5)) self.check_tuple_arg(a, b) def test_tuple_of_array_arg(self): - a = (np.asarray((1.0, 2.0, 3.0)), - np.asarray((4.0, 5.0, 6.0))) - b = (np.asarray((1.5, 2.5, 3.5)), - np.asarray((4.5, 5.5, 6.5))) + a = (np.asarray((1.0, 2.0, 3.0)), np.asarray((4.0, 5.0, 6.0))) + b = (np.asarray((1.5, 2.5, 3.5)), np.asarray((4.5, 5.5, 6.5))) self.check_tuple_arg(a, b) def test_gufunc_name(self): gufunc = _get_matmulcore_gufunc() - self.assertEqual(gufunc.__name__, 'matmulcore') + self.assertEqual(gufunc.__name__, "matmulcore") def test_bad_return_type(self): with self.assertRaises(TypeError) as te: - @guvectorize([int32(int32[:], int32[:])], '(m)->(m)', target='cuda') + + @guvectorize([int32(int32[:], int32[:])], "(m)->(m)", target="cuda") def f(x, y): pass msg = str(te.exception) - self.assertIn('guvectorized functions cannot return values', msg) - self.assertIn('specifies int32 return type', msg) + self.assertIn("guvectorized functions cannot return values", msg) + self.assertIn("specifies int32 return type", msg) def test_incorrect_number_of_pos_args(self): - @guvectorize([(int32[:], int32[:], 
int32[:])], - '(m),(m)->(m)', target='cuda') + @guvectorize( + [(int32[:], int32[:], int32[:])], "(m),(m)->(m)", target="cuda" + ) def f(x, y, z): pass @@ -345,26 +354,28 @@ def f(x, y, z): f(arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 3 positional arguments', msg) - self.assertIn('Got 1 positional argument.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 3 positional arguments", msg) + self.assertIn("Got 1 positional argument.", msg) # Inputs and outputs, too many with self.assertRaises(TypeError) as te: f(arr, arr, arr, arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 3 positional arguments', msg) - self.assertIn('Got 4 positional arguments.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 3 positional arguments", msg) + self.assertIn("Got 4 positional arguments.", msg) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestMultipleOutputs(CUDATestCase): def test_multiple_outputs_same_type_passed_in(self): - @guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy(A, B, C): for i in range(B.size): B[i] = A[i] @@ -378,10 +389,11 @@ def copy(A, B, C): np.testing.assert_allclose(A, C) def test_multiple_outputs_distinct_values(self): - - @guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_double(A, B, C): for i in range(B.size): B[i] = A[i] @@ -395,9 +407,11 @@ def copy_and_double(A, B, C): np.testing.assert_allclose(A * 2, C) def test_multiple_output_allocation(self): - 
@guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_double(A, B, C): for i in range(B.size): B[i] = A[i] @@ -409,10 +423,11 @@ def copy_and_double(A, B, C): np.testing.assert_allclose(A * 2, C) def test_multiple_output_dtypes(self): - - @guvectorize([void(int32[:], int32[:], float64[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(int32[:], int32[:], float64[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_multiply(A, B, C): for i in range(B.size): B[i] = A[i] @@ -426,8 +441,11 @@ def copy_and_multiply(A, B, C): np.testing.assert_allclose(A * np.float64(1.5), C) def test_incorrect_number_of_pos_args(self): - @guvectorize([(int32[:], int32[:], int32[:], int32[:])], - '(m),(m)->(m),(m)', target='cuda') + @guvectorize( + [(int32[:], int32[:], int32[:], int32[:])], + "(m),(m)->(m),(m)", + target="cuda", + ) def f(x, y, z, w): pass @@ -438,19 +456,19 @@ def f(x, y, z, w): f(arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 4 positional arguments', msg) - self.assertIn('Got 1 positional argument.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 4 positional arguments", msg) + self.assertIn("Got 1 positional argument.", msg) # Inputs and outputs, too many with self.assertRaises(TypeError) as te: f(arr, arr, arr, arr, arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 4 positional arguments', msg) - self.assertIn('Got 5 positional arguments.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 4 positional arguments", msg) + self.assertIn("Got 5 positional arguments.", msg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py index 493a9ceec..6b9940805 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py @@ -3,13 +3,14 @@ See Numpy documentation for detail about gufunc: http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html """ + import numpy as np from numba import guvectorize, cuda from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestGUFuncScalar(CUDATestCase): def test_gufunc_scalar_output(self): # function type: @@ -20,9 +21,9 @@ def test_gufunc_scalar_output(self): # signature: (n)->() # - the function takes an array of n-element and output a scalar. - @guvectorize(['void(int32[:], int32[:])'], '(n)->()', target='cuda') + @guvectorize(["void(int32[:], int32[:])"], "(n)->()", target="cuda") def sum_row(inp, out): - tmp = 0. 
+ tmp = 0.0 for i in range(inp.shape[0]): tmp += inp[i] out[0] = tmp @@ -38,15 +39,14 @@ def sum_row(inp, out): out1 = np.empty(100, dtype=inp.dtype) out2 = np.empty(100, dtype=inp.dtype) - dev_inp = cuda.to_device( - inp) # alloc and copy input data - dev_out1 = cuda.to_device(out1, copy=False) # alloc only + dev_inp = cuda.to_device(inp) # alloc and copy input data + dev_out1 = cuda.to_device(out1, copy=False) # alloc only - sum_row(dev_inp, out=dev_out1) # invoke the gufunc - dev_out2 = sum_row(dev_inp) # invoke the gufunc + sum_row(dev_inp, out=dev_out1) # invoke the gufunc + dev_out2 = sum_row(dev_inp) # invoke the gufunc - dev_out1.copy_to_host(out1) # retrieve the result - dev_out2.copy_to_host(out2) # retrieve the result + dev_out1.copy_to_host(out1) # retrieve the result + dev_out2.copy_to_host(out2) # retrieve the result # verify result for i in range(inp.shape[0]): @@ -55,7 +55,7 @@ def sum_row(inp, out): def test_gufunc_scalar_output_bug(self): # Issue 2812: Error due to using input argument types as output argument - @guvectorize(['void(int32, int32[:])'], '()->()', target='cuda') + @guvectorize(["void(int32, int32[:])"], "()->()", target="cuda") def twice(inp, out): out[0] = inp * 2 @@ -64,8 +64,11 @@ def twice(inp, out): self.assertPreciseEqual(twice(arg), arg * 2) def test_gufunc_scalar_input_saxpy(self): - @guvectorize(['void(float32, float32[:], float32[:], float32[:])'], - '(),(t),(t)->(t)', target='cuda') + @guvectorize( + ["void(float32, float32[:], float32[:], float32[:])"], + "(),(t),(t)->(t)", + target="cuda", + ) def saxpy(a, x, y, out): for i in range(out.shape[0]): out[i] = a * x[i] + y[i] @@ -99,8 +102,9 @@ def saxpy(a, x, y, out): self.assertTrue(exp == out[j, i], (exp, out[j, i])) def test_gufunc_scalar_cast(self): - @guvectorize(['void(int32, int32[:], int32[:])'], '(),(t)->(t)', - target='cuda') + @guvectorize( + ["void(int32, int32[:], int32[:])"], "(),(t)->(t)", target="cuda" + ) def foo(a, b, out): for i in range(b.size): out[i] = 
a * b[i] @@ -121,8 +125,9 @@ def foo(a, b, out): def test_gufunc_old_style_scalar_as_array(self): # Example from issue #2579 - @guvectorize(['void(int32[:],int32[:],int32[:])'], '(n),()->(n)', - target='cuda') + @guvectorize( + ["void(int32[:],int32[:],int32[:])"], "(n),()->(n)", target="cuda" + ) def gufunc(x, y, res): for i in range(x.shape[0]): res[i] = x[i] + y[0] @@ -155,5 +160,5 @@ def gufunc(x, y, res): np.testing.assert_almost_equal(expected, res) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py index fb8de3285..3c04a978e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py @@ -8,88 +8,82 @@ def template(signature, shapes, expects): for k, v in expects.items(): got = getattr(sch, k) if got != v: - fmt = 'error for %s: got=%s but expect=%s' + fmt = "error for %s: got=%s but expect=%s" raise AssertionError(fmt % (k, got, v)) class TestGUFuncScheduling(unittest.TestCase): def test_signature_1(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (100, 4, 5), (1, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(100,), - pinned=[False, True] + pinned=[False, True], ) template(signature, shapes, expects) def test_signature_2(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (100, 4, 5), (100, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(100,), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_3(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (12, 34, 4, 5), (12, 34, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(12, 34), - pinned=[False, 
False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_4(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (4, 5), (5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_5(self): - signature = '(a), (a) -> (a)' + signature = "(a), (a) -> (a)" shapes = (5,), (5,) expects = dict( ishapes=[(5,), (5,)], oshapes=[(5,)], loopdims=(), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_6(self): - signature = '(), () -> ()' + signature = "(), () -> ()" shapes = (5,), (5,) expects = dict( - ishapes=[(), ()], - oshapes=[()], - loopdims=(5,), - pinned=[False, False] + ishapes=[(), ()], oshapes=[()], loopdims=(5,), pinned=[False, False] ) template(signature, shapes, expects) def test_signature_7(self): - signature = '(), () -> ()' + signature = "(), () -> ()" shapes = (5,), () expects = dict( - ishapes=[(), ()], - oshapes=[()], - loopdims=(5,), - pinned=[False, True] + ishapes=[(), ()], oshapes=[()], loopdims=(5,), pinned=[False, True] ) template(signature, shapes, expects) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py b/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py index 44b770f42..332a87718 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py @@ -5,7 +5,6 @@ class TestCudaIDiv(CUDATestCase): def test_inplace_div(self): - @cuda.jit(void(float32[:, :], int32, int32)) def div(grid, l_x, l_y): for x in range(l_x): @@ -19,7 +18,6 @@ def div(grid, l_x, l_y): self.assertTrue(np.all(y == 0.5)) def test_inplace_div_double(self): - @cuda.jit(void(float64[:, :], int32, int32)) def div_double(grid, l_x, l_y): for x in range(l_x): @@ -33,5 +31,5 @@ def div_double(grid, l_x, 
l_y): self.assertTrue(np.all(y == 0.5)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py b/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py index 5c122dbd9..5a038a11c 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py @@ -3,11 +3,14 @@ from io import StringIO from numba import cuda, float32, float64, int32, intp from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_with_nvdisasm, - skip_without_nvdisasm) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_with_nvdisasm, + skip_without_nvdisasm, +) -@skip_on_cudasim('Simulator does not generate code to be inspected') +@skip_on_cudasim("Simulator does not generate code to be inspected") class TestInspect(CUDATestCase): @property def cc(self): @@ -60,7 +63,10 @@ def foo(x, y): # Signature in LLVM dict llvmirs = foo.inspect_llvm() - self.assertEqual(2, len(llvmirs), ) + self.assertEqual( + 2, + len(llvmirs), + ) self.assertIn((intp, intp), llvmirs) self.assertIn((float64, float64), llvmirs) @@ -75,7 +81,10 @@ def foo(x, y): asmdict = foo.inspect_asm() # Signature in assembly dict - self.assertEqual(2, len(asmdict), ) + self.assertEqual( + 2, + len(asmdict), + ) self.assertIn((intp, intp), asmdict) self.assertIn((float64, float64), asmdict) @@ -87,7 +96,7 @@ def _test_inspect_sass(self, kernel, name, sass): # Ensure function appears in output seen_function = False for line in sass.split(): - if '.text' in line and name in line: + if ".text" in line and name in line: seen_function = True self.assertTrue(seen_function) @@ -95,11 +104,11 @@ def _test_inspect_sass(self, kernel, name, sass): # Some instructions common to all supported architectures that should # appear in the output - self.assertIn('S2R', sass) # Special register to register - self.assertIn('BRA', sass) # Branch - self.assertIn('EXIT', 
sass) # Exit program + self.assertIn("S2R", sass) # Special register to register + self.assertIn("BRA", sass) # Branch + self.assertIn("EXIT", sass) # Exit program - @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass()") def test_inspect_sass_eager(self): sig = (float32[::1], int32[::1]) @@ -109,9 +118,9 @@ def add(x, y): if i < len(x): x[i] += y[i] - self._test_inspect_sass(add, 'add', add.inspect_sass(sig)) + self._test_inspect_sass(add, "add", add.inspect_sass(sig)) - @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass()") def test_inspect_sass_lazy(self): @cuda.jit(lineinfo=True) def add(x, y): @@ -124,10 +133,11 @@ def add(x, y): add[1, 10](x, y) signature = (int32[::1], float32[::1]) - self._test_inspect_sass(add, 'add', add.inspect_sass(signature)) + self._test_inspect_sass(add, "add", add.inspect_sass(signature)) - @skip_with_nvdisasm('Missing nvdisasm exception only generated when it is ' - 'not present') + @skip_with_nvdisasm( + "Missing nvdisasm exception only generated when it is not present" + ) def test_inspect_sass_nvdisasm_missing(self): @cuda.jit((float32[::1],)) def f(x): @@ -136,9 +146,9 @@ def f(x): with self.assertRaises(RuntimeError) as raises: f.inspect_sass() - self.assertIn('nvdisasm has not been found', str(raises.exception)) + self.assertIn("nvdisasm has not been found", str(raises.exception)) - @skip_without_nvdisasm('nvdisasm needed for inspect_sass_cfg()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass_cfg()") def test_inspect_sass_cfg(self): sig = (float32[::1], int32[::1]) @@ -149,10 +159,9 @@ def add(x, y): x[i] += y[i] self.assertRegex( - add.inspect_sass_cfg(signature=sig), - r'digraph\s*\w\s*{(.|\n)*\n}' + add.inspect_sass_cfg(signature=sig), r"digraph\s*\w\s*{(.|\n)*\n}" ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py b/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py index 5622789f7..6e4fa61e3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py @@ -6,8 +6,12 @@ from numba.cuda import compile_ptx from numba.core.errors import TypingError from numba.core.types import f2 -from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim, - skip_unless_cc_53) +from numba.cuda.testing import ( + unittest, + CUDATestCase, + skip_on_cudasim, + skip_unless_cc_53, +) def simple_threadidx(ary): @@ -260,7 +264,6 @@ def simple_hsqrt(r, x): def simple_hrsqrt(r, x): - i = cuda.grid(1) if i < len(r): @@ -268,7 +271,7 @@ def simple_hrsqrt(r, x): def numpy_hrsqrt(x, dtype): - return x ** -0.5 + return x**-0.5 def simple_hceil(r, x): @@ -404,15 +407,15 @@ def f_contigous(): f_res = f_contigous() self.assertTrue(np.all(c_res == f_res)) - @skip_on_cudasim('Cudasim does not check types') + @skip_on_cudasim("Cudasim does not check types") def test_nonliteral_grid_error(self): - with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'): - cuda.jit('void(int32)')(nonliteral_grid) + with self.assertRaisesRegex(TypingError, "RequireLiteralValue"): + cuda.jit("void(int32)")(nonliteral_grid) - @skip_on_cudasim('Cudasim does not check types') + @skip_on_cudasim("Cudasim does not check types") def test_nonliteral_gridsize_error(self): - with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'): - cuda.jit('void(int32)')(nonliteral_gridsize) + with self.assertRaisesRegex(TypingError, "RequireLiteralValue"): + cuda.jit("void(int32)")(nonliteral_gridsize) def test_simple_grid1d(self): compiled = cuda.jit("void(int32[::1])")(simple_grid1d) @@ -444,7 +447,7 @@ def test_simple_gridsize1d(self): compiled[nctaid, ntid](ary) self.assertEqual(ary[0], nctaid * ntid) - @skip_on_cudasim('Requires too many threads') + @skip_on_cudasim("Requires too many threads") def 
test_issue_9229(self): # Ensure that grid and grid size are correct - #9229 showed that they # overflowed an int32. @@ -469,7 +472,7 @@ def f(grid_error, gridsize_error): self.assertEqual(grid_error[0], 0) self.assertEqual(gridsize_error[0], 0) - @skip_on_cudasim('Tests PTX emission') + @skip_on_cudasim("Tests PTX emission") def test_selp(self): sig = (int64[:], int64, int64[:]) cu_branching_with_ifs = cuda.jit(sig)(branching_with_ifs) @@ -485,14 +488,14 @@ def test_selp(self): a = np.arange(n, dtype=np.int64) cu_branching_with_ifs[n, 1](a, b, c) ptx = cu_branching_with_ifs.inspect_asm(sig) - self.assertEqual(2, len(re.findall(r'\s+bra\s+', ptx))) - np.testing.assert_array_equal(a, expected, err_msg='branching') + self.assertEqual(2, len(re.findall(r"\s+bra\s+", ptx))) + np.testing.assert_array_equal(a, expected, err_msg="branching") a = np.arange(n, dtype=np.int64) cu_branching_with_selps[n, 1](a, b, c) ptx = cu_branching_with_selps.inspect_asm(sig) - self.assertEqual(0, len(re.findall(r'\s+bra\s+', ptx))) - np.testing.assert_array_equal(a, expected, err_msg='selp') + self.assertEqual(0, len(re.findall(r"\s+bra\s+", ptx))) + np.testing.assert_array_equal(a, expected, err_msg="selp") def test_simple_gridsize2d(self): compiled = cuda.jit("void(int32[::1])")(simple_gridsize2d) @@ -528,10 +531,10 @@ def foo(out): a, b, c = cuda.gridsize(3) out[x, y, z] = a * b * c - arr = np.zeros(9 ** 3, dtype=np.int32).reshape(9, 9, 9) + arr = np.zeros(9**3, dtype=np.int32).reshape(9, 9, 9) foo[(3, 3, 3), (3, 3, 3)](arr) - np.testing.assert_equal(arr, 9 ** 3) + np.testing.assert_equal(arr, 9**3) def test_3dgrid_2(self): @cuda.jit @@ -539,13 +542,15 @@ def foo(out): x, y, z = cuda.grid(3) a, b, c = cuda.gridsize(3) grid_is_right = ( - x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and - y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and - z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z + x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x + and y 
== cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y + and z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z + ) + gridsize_is_right = ( + a == cuda.blockDim.x * cuda.gridDim.x + and b == cuda.blockDim.y * cuda.gridDim.y + and c == cuda.blockDim.z * cuda.gridDim.z ) - gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and - b == cuda.blockDim.y * cuda.gridDim.y and - c == cuda.blockDim.z * cuda.gridDim.z) out[x, y, z] = grid_is_right and gridsize_is_right x, y, z = (4 * 3, 3 * 2, 2 * 4) @@ -605,21 +610,21 @@ def test_bit_count_u8(self): def test_fma_f4(self): compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma) ary = np.zeros(1, dtype=np.float32) - compiled[1, 1](ary, 2., 3., 4.) + compiled[1, 1](ary, 2.0, 3.0, 4.0) np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_fma_f8(self): compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma) ary = np.zeros(1, dtype=np.float64) - compiled[1, 1](ary, 2., 3., 4.) + compiled[1, 1](ary, 2.0, 3.0, 4.0) np.testing.assert_allclose(ary[0], 2 * 3 + 4) @skip_unless_cc_53 def test_hadd(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hadd) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 + arg2) @@ -628,24 +633,24 @@ def test_hadd_scalar(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hadd_scalar) ary = np.zeros(1, dtype=np.float16) arg1 = np.float16(3.1415926) - arg2 = np.float16(3.) 
+ arg2 = np.float16(3.0) compiled[1, 1](ary, arg1, arg2) ref = arg1 + arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hadd_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3)) - self.assertIn('add.f16', ptx) + self.assertIn("add.f16", ptx) @skip_unless_cc_53 def test_hfma(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:], f2[:])")(simple_hfma) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([2.], dtype=np.float16) - arg2 = np.array([3.], dtype=np.float16) - arg3 = np.array([4.], dtype=np.float16) + arg1 = np.array([2.0], dtype=np.float16) + arg2 = np.array([3.0], dtype=np.float16) + arg3 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2, arg3) np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3) @@ -653,25 +658,25 @@ def test_hfma(self): def test_hfma_scalar(self): compiled = cuda.jit("void(f2[:], f2, f2, f2)")(simple_hfma_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) 
+ arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) ref = arg1 * arg2 + arg3 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hfma_ptx(self): args = (f2[:], f2, f2, f2) ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3)) - self.assertIn('fma.rn.f16', ptx) + self.assertIn("fma.rn.f16", ptx) @skip_unless_cc_53 def test_hsub(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hsub) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 - arg2) @@ -685,18 +690,18 @@ def test_hsub_scalar(self): ref = arg1 - arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hsub_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3)) - self.assertIn('sub.f16', ptx) + self.assertIn("sub.f16", ptx) @skip_unless_cc_53 def test_hmul(self): compiled = cuda.jit()(simple_hmul) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 * arg2) @@ -710,11 +715,11 @@ def test_hmul_scalar(self): ref = arg1 * arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hmul_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3)) - 
self.assertIn('mul.f16', ptx) + self.assertIn("mul.f16", ptx) @skip_unless_cc_53 def test_hdiv_scalar(self): @@ -742,7 +747,7 @@ def test_hdiv(self): def test_hneg(self): compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) compiled[1, 1](ary, arg1) np.testing.assert_allclose(ary[0], -arg1) @@ -755,17 +760,17 @@ def test_hneg_scalar(self): ref = -arg1 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hneg_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3)) - self.assertIn('neg.f16', ptx) + self.assertIn("neg.f16", ptx) @skip_unless_cc_53 def test_habs(self): compiled = cuda.jit()(simple_habs) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([-3.], dtype=np.float16) + arg1 = np.array([-3.0], dtype=np.float16) compiled[1, 1](ary, arg1) np.testing.assert_allclose(ary[0], abs(arg1)) @@ -778,25 +783,43 @@ def test_habs_scalar(self): ref = abs(arg1) np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_habs_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3)) - self.assertIn('abs.f16', ptx) + self.assertIn("abs.f16", ptx) @skip_unless_cc_53 def test_fp16_intrinsics_common(self): - kernels = (simple_hsin, simple_hcos, - simple_hlog, simple_hlog2, simple_hlog10, - simple_hsqrt, simple_hceil, simple_hfloor, - simple_hrcp, simple_htrunc, simple_hrint, - simple_hrsqrt) + kernels = ( + simple_hsin, + simple_hcos, + simple_hlog, + simple_hlog2, + simple_hlog10, + simple_hsqrt, + simple_hceil, + simple_hfloor, + simple_hrcp, + simple_htrunc, + simple_hrint, + simple_hrsqrt, + ) exp_kernels = (simple_hexp, simple_hexp2) - expected_functions = 
(np.sin, np.cos, - np.log, np.log2, np.log10, - np.sqrt, np.ceil, np.floor, - np.reciprocal, np.trunc, np.rint, - numpy_hrsqrt) + expected_functions = ( + np.sin, + np.cos, + np.log, + np.log2, + np.log10, + np.sqrt, + np.ceil, + np.floor, + np.reciprocal, + np.trunc, + np.rint, + numpy_hrsqrt, + ) expected_exp_functions = (np.exp, np.exp2) # Generate random data @@ -807,7 +830,7 @@ def test_fp16_intrinsics_common(self): for kernel, fn in zip(kernels, expected_functions): with self.subTest(fn=fn): kernel = cuda.jit("void(f2[:], f2[:])")(kernel) - kernel[1,N](r, x) + kernel[1, N](r, x) expected = fn(x, dtype=np.float16) np.testing.assert_allclose(r, expected) @@ -815,7 +838,7 @@ def test_fp16_intrinsics_common(self): for kernel, fn in zip(exp_kernels, expected_exp_functions): with self.subTest(fn=fn): kernel = cuda.jit("void(f2[:], f2[:])")(kernel) - kernel[1,N](r, x2) + kernel[1, N](r, x2) expected = fn(x2, dtype=np.float16) np.testing.assert_allclose(r, expected) @@ -836,14 +859,26 @@ def hexp10_vectors(r, x): # Run the kernel hexp10_vectors[1, N](r, x) - np.testing.assert_allclose(r, 10 ** x) + np.testing.assert_allclose(r, 10**x) @skip_unless_cc_53 def test_fp16_comparison(self): - fns = (simple_heq_scalar, simple_hne_scalar, simple_hge_scalar, - simple_hgt_scalar, simple_hle_scalar, simple_hlt_scalar) - ops = (operator.eq, operator.ne, operator.ge, - operator.gt, operator.le, operator.lt) + fns = ( + simple_heq_scalar, + simple_hne_scalar, + simple_hge_scalar, + simple_hgt_scalar, + simple_hle_scalar, + simple_hlt_scalar, + ) + ops = ( + operator.eq, + operator.ne, + operator.ge, + operator.gt, + operator.le, + operator.lt, + ) for fn, op in zip(fns, ops): with self.subTest(op=op): @@ -872,18 +907,20 @@ def test_fp16_comparison(self): @skip_unless_cc_53 def test_multiple_float16_comparisons(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + 
test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) + arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertTrue(ary[0]) @@ -891,11 +928,11 @@ def test_multiple_float16_comparisons(self): def test_hmax(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmax_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(3.) - arg2 = np.float16(4.) + arg1 = np.float16(3.0) + arg2 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg2) - arg1 = np.float16(5.) + arg1 = np.float16(5.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1) @@ -903,25 +940,25 @@ def test_hmax(self): def test_hmin(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmin_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(3.) - arg2 = np.float16(4.) + arg1 = np.float16(3.0) + arg2 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1) - arg1 = np.float16(5.) + arg1 = np.float16(5.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg2) def test_cbrt_f32(self): compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt) ary = np.zeros(1, dtype=np.float32) - cbrt_arg = 2. + cbrt_arg = 2.0 compiled[1, 1](ary, cbrt_arg) np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3)) def test_cbrt_f64(self): compiled = cuda.jit("void(float64[:], float64)")(simple_cbrt) ary = np.zeros(1, dtype=np.float64) - cbrt_arg = 6. 
+ cbrt_arg = 6.0 compiled[1, 1](ary, cbrt_arg) np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3)) @@ -1052,25 +1089,36 @@ def test_round_to_f4(self): np.concatenate((vals, np.array([np.inf, -np.inf, np.nan]))) digits = ( # Common case branch of round_to_impl - -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + -5, + -4, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + 4, + 5, # The algorithm currently implemented can only round to 13 digits # with single precision. Note that this doesn't trigger the # "overflow safe" branch of the implementation, which can only be # hit when using double precision. - 13 + 13, ) for val, ndigits in itertools.product(vals, digits): with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='single') + self.assertPreciseEqual( + ary[0], round(val, ndigits), prec="single" + ) # CPython on most platforms uses rounding based on dtoa.c, whereas the CUDA # round-to implementation uses CPython's fallback implementation, which has # slightly different behavior at the edges of the domain. Since the CUDA # simulator executes using CPython, we need to skip this test when the # simulator is active. - @skip_on_cudasim('Overflow behavior differs on CPython') + @skip_on_cudasim("Overflow behavior differs on CPython") def test_round_to_f4_overflow(self): # Test that the input value is returned when y in round_ndigits # overflows. 
@@ -1092,7 +1140,7 @@ def test_round_to_f4_halfway(self): val = 0.3425 ndigits = 3 compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), prec='single') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="single") def test_round_to_f8(self): compiled = cuda.jit("void(float64[:], float64, int32)")(simple_round_to) @@ -1105,19 +1153,19 @@ def test_round_to_f8(self): for val, ndigits in itertools.product(vals, digits): with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='exact') + self.assertPreciseEqual( + ary[0], round(val, ndigits), prec="exact" + ) # Trigger the "overflow safe" branch of the implementation val = 0.12345678987654321 * 10e-15 ndigits = 23 with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='double') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double") # Skipped on cudasim for the same reasons as test_round_to_f4 above. - @skip_on_cudasim('Overflow behavior differs on CPython') + @skip_on_cudasim("Overflow behavior differs on CPython") def test_round_to_f8_overflow(self): # Test that the input value is returned when y in round_ndigits # overflows. 
@@ -1139,8 +1187,8 @@ def test_round_to_f8_halfway(self): val = 0.5425 ndigits = 3 compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), prec='double') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py index 657e9a104..4a6083cd2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py @@ -7,9 +7,13 @@ from numba import cuda from numba.cuda.cudadrv import driver -from numba.cuda.testing import (skip_on_arm, skip_on_cudasim, - skip_under_cuda_memcheck, - ContextResettingTestCase, ForeignArray) +from numba.cuda.testing import ( + skip_on_arm, + skip_on_cudasim, + skip_under_cuda_memcheck, + ContextResettingTestCase, + ForeignArray, +) from numba.tests.support import linux_only, windows_only import unittest @@ -32,8 +36,9 @@ def core_ipc_handle_test(the_work, result_queue): def base_ipc_handle_test(handle, size, result_queue): def the_work(): dtype = np.dtype(np.intp) - with cuda.open_ipc_array(handle, shape=size // dtype.itemsize, - dtype=dtype) as darr: + with cuda.open_ipc_array( + handle, shape=size // dtype.itemsize, dtype=dtype + ) as darr: # copy the data to host return darr.copy_to_host() @@ -43,9 +48,11 @@ def the_work(): def serialize_ipc_handle_test(handle, result_queue): def the_work(): dtype = np.dtype(np.intp) - darr = handle.open_array(cuda.current_context(), - shape=handle.size // dtype.itemsize, - dtype=dtype) + darr = handle.open_array( + cuda.current_context(), + shape=handle.size // dtype.itemsize, + dtype=dtype, + ) # copy the data to host arr = darr.copy_to_host() handle.close() @@ -63,10 +70,10 @@ def ipc_array_test(ipcarr, result_queue): with ipcarr: pass except ValueError as e: - if str(e) != 'IpcHandle is already opened': - raise AssertionError('invalid 
exception message') + if str(e) != "IpcHandle is already opened": + raise AssertionError("invalid exception message") else: - raise AssertionError('did not raise on reopen') + raise AssertionError("did not raise on reopen") # Catch any exception so we can propagate it except: # noqa: E722 # FAILED. propagate the exception as a string @@ -80,11 +87,10 @@ def ipc_array_test(ipcarr, result_queue): @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Ipc not available in CUDASIM') -@skip_on_arm('CUDA IPC not supported on ARM in Numba') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Ipc not available in CUDASIM") +@skip_on_arm("CUDA IPC not supported on ARM in Numba") class TestIpcMemory(ContextResettingTestCase): - def test_ipc_handle(self): # prepare data for IPC arr = np.arange(10, dtype=np.intp) @@ -102,7 +108,7 @@ def test_ipc_handle(self): size = ipch.size # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (handle_bytes, size, result_queue) proc = ctx.Process(target=base_ipc_handle_test, args=args) @@ -145,11 +151,12 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(ipch_recon.handle.reserved[:], - ipch.handle.reserved[:]) + self.assertEqual( + ipch_recon.handle.reserved[:], ipch.handle.reserved[:] + ) # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, result_queue) proc = ctx.Process(target=serialize_ipc_handle_test, args=args) @@ -162,7 +169,10 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): proc.join(3) def test_ipc_handle_serialization(self): - for index, foreign, in self.variants(): + for ( + index, + foreign, + ) in self.variants(): with self.subTest(index=index, 
foreign=foreign): self.check_ipc_handle_serialization(index, foreign) @@ -179,7 +189,7 @@ def check_ipc_array(self, index_arg=None, foreign=False): ipch = devarr.get_ipc_handle() # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, result_queue) proc = ctx.Process(target=ipc_array_test, args=args) @@ -192,7 +202,10 @@ def check_ipc_array(self, index_arg=None, foreign=False): proc.join(3) def test_ipc_array(self): - for index, foreign, in self.variants(): + for ( + index, + foreign, + ) in self.variants(): with self.subTest(index=index, foreign=foreign): self.check_ipc_array(index, foreign) @@ -205,7 +218,9 @@ def the_work(): arrsize = handle.size // np.dtype(np.intp).itemsize hostarray = np.zeros(arrsize, dtype=np.intp) cuda.driver.device_to_host( - hostarray, deviceptr, size=handle.size, + hostarray, + deviceptr, + size=handle.size, ) handle.close() return hostarray @@ -223,10 +238,10 @@ def staged_ipc_array_test(ipcarr, device_num, result_queue): with ipcarr: pass except ValueError as e: - if str(e) != 'IpcHandle is already opened': - raise AssertionError('invalid exception message') + if str(e) != "IpcHandle is already opened": + raise AssertionError("invalid exception message") else: - raise AssertionError('did not raise on reopen') + raise AssertionError("did not raise on reopen") # Catch any exception so we can propagate it except: # noqa: E722 # FAILED. 
propagate the exception as a string @@ -240,9 +255,9 @@ def staged_ipc_array_test(ipcarr, device_num, result_queue): @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Ipc not available in CUDASIM') -@skip_on_arm('CUDA IPC not supported on ARM in Numba') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Ipc not available in CUDASIM") +@skip_on_arm("CUDA IPC not supported on ARM in Numba") class TestIpcStaged(ContextResettingTestCase): def test_staged(self): # prepare data for IPC @@ -250,7 +265,7 @@ def test_staged(self): devarr = cuda.to_device(arr) # spawn new process for testing - mpctx = mp.get_context('spawn') + mpctx = mp.get_context("spawn") result_queue = mpctx.Queue() # create IPC handle @@ -264,8 +279,7 @@ def test_staged(self): self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: self.assertEqual( - ipch_recon.handle.reserved[:], - ipch.handle.reserved[:] + ipch_recon.handle.reserved[:], ipch.handle.reserved[:] ) self.assertEqual(ipch_recon.size, ipch.size) @@ -289,7 +303,7 @@ def test_ipc_array(self): ipch = devarr.get_ipc_handle() # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, device_num, result_queue) proc = ctx.Process(target=staged_ipc_array_test, args=args) @@ -303,7 +317,7 @@ def test_ipc_array(self): @windows_only -@skip_on_cudasim('Ipc not available in CUDASIM') +@skip_on_cudasim("Ipc not available in CUDASIM") class TestIpcNotSupported(ContextResettingTestCase): def test_unsupported(self): arr = np.arange(10, dtype=np.intp) @@ -311,8 +325,8 @@ def test_unsupported(self): with self.assertRaises(OSError) as raises: devarr.get_ipc_handle() errmsg = str(raises.exception) - self.assertIn('OS does not support CUDA IPC', errmsg) + self.assertIn("OS does not support CUDA IPC", errmsg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py b/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py index 47366f380..4a69badc2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py @@ -5,7 +5,6 @@ class TestIterators(CUDATestCase): - def test_enumerate(self): @cuda.jit def enumerator(x, error): @@ -95,5 +94,5 @@ def zipper_enumerator(x, y, error): self._test_twoarg_function(zipper_enumerator) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_lang.py b/numba_cuda/numba/cuda/tests/cudapy/test_lang.py index 0241c1e40..97562c250 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_lang.py @@ -10,7 +10,7 @@ class TestLang(CUDATestCase): def test_enumerate(self): - tup = (1., 2.5, 3.) + tup = (1.0, 2.5, 3.0) @cuda.jit("void(float64[:])") def foo(a): @@ -39,12 +39,12 @@ def foo(a): self.assertTrue(np.all(a == (b + c).sum())) def test_issue_872(self): - ''' + """ Ensure that typing and lowering of CUDA kernel API primitives works in more than one block. Was originally to ensure that macro expansion works for more than one block (issue #872), but macro expansion has been replaced by a "proper" implementation of all kernel API functions. 
- ''' + """ @cuda.jit("void(float64[:,:])") def cuda_kernel_api_in_multiple_blocks(ary): @@ -60,5 +60,5 @@ def cuda_kernel_api_in_multiple_blocks(ary): cuda_kernel_api_in_multiple_blocks[1, (2, 3)](a) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py b/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py index d868b0297..3a1dee8b0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py @@ -14,7 +14,6 @@ class TestCudaLaplace(CUDATestCase): def test_laplace_small(self): - @cuda.jit(float64(float64, float64), device=True, inline=True) def get_max(a, b): if a > b: @@ -38,8 +37,9 @@ def jocabi_relax_core(A, Anew, error): err_sm[ty, tx] = 0 if j >= 1 and j < n - 1 and i >= 1 and i < m - 1: - Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] - + A[j - 1, i] + A[j + 1, i]) + Anew[j, i] = 0.25 * ( + A[j, i + 1] + A[j, i - 1] + A[j - 1, i] + A[j + 1, i] + ) err_sm[ty, tx] = Anew[j, i] - A[j, i] cuda.syncthreads() @@ -91,8 +91,8 @@ def jocabi_relax_core(A, Anew, error): stream = cuda.stream() - dA = cuda.to_device(A, stream) # to device and don't come back - dAnew = cuda.to_device(Anew, stream) # to device and don't come back + dA = cuda.to_device(A, stream) # to device and don't come back + dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: @@ -115,5 +115,5 @@ def jocabi_relax_core(A, Anew, error): iter += 1 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py b/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py index 9572a8882..d2b85e501 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py @@ -31,7 +31,7 @@ def use_sad(r, x, y, z): r[i] = libdevice.sad(x[i], y[i], 
z[i]) -@skip_on_cudasim('Libdevice functions are not supported on cudasim') +@skip_on_cudasim("Libdevice functions are not supported on cudasim") class TestLibdevice(CUDATestCase): """ Some tests of libdevice function wrappers that check the returned values. @@ -102,14 +102,15 @@ def make_test_call(libname): def _test_call_functions(self): # Strip off '__nv_' from libdevice name to get Python name apiname = libname[5:] - apifunc = getattr(libdevice, apiname) + apifunc = getattr(libdevice, apiname) # noqa: F841 retty, args = functions[libname] sig = create_signature(retty, args) # Construct arguments to the libdevice function. These are all # non-pointer arguments to the underlying bitcode function. - funcargs = ", ".join(['a%d' % i for i, arg in enumerate(args) if not - arg.is_ptr]) + funcargs = ", ".join( + ["a%d" % i for i, arg in enumerate(args) if not arg.is_ptr] + ) # Arguments to the Python function (`pyfunc` in the template above) are # the arguments to the libdevice function, plus as many extra arguments @@ -118,35 +119,37 @@ def _test_call_functions(self): # returns. if isinstance(sig.return_type, (types.Tuple, types.UniTuple)): # Start with the parameters for the return values - pyargs = ", ".join(['r%d' % i for i in - range(len(sig.return_type))]) + pyargs = ", ".join(["r%d" % i for i in range(len(sig.return_type))]) # Add the parameters for the argument values pyargs += ", " + funcargs # Generate the unpacking of the return value from the libdevice # function into the Python function return values (`r0`, `r1`, # etc.). 
- retvars = ", ".join(['r%d[0]' % i for i in - range(len(sig.return_type))]) + retvars = ", ".join( + ["r%d[0]" % i for i in range(len(sig.return_type))] + ) else: # Scalar return is a more straightforward case pyargs = "r0, " + funcargs retvars = "r0[0]" # Create the string containing the function to compile - d = { 'func': apiname, - 'pyargs': pyargs, - 'funcargs': funcargs, - 'retvars': retvars } + d = { + "func": apiname, + "pyargs": pyargs, + "funcargs": funcargs, + "retvars": retvars, + } code = function_template % d # Convert the string to a Python function locals = {} exec(code, globals(), locals) - pyfunc = locals['pyfunc'] + pyfunc = locals["pyfunc"] # Compute the signature for compilation. This mirrors the creation of # arguments to the Python function above. - pyargs = [ arg.ty for arg in args if not arg.is_ptr ] + pyargs = [arg.ty for arg in args if not arg.is_ptr] if isinstance(sig.return_type, (types.Tuple, types.UniTuple)): pyreturns = [ret[::1] for ret in sig.return_type] pyargs = pyreturns + pyargs @@ -159,16 +162,16 @@ def _test_call_functions(self): # If the function body was discarded by optimization (therefore making # the test a bit weak), there won't be any loading of parameters - # ensure that a load from parameters occurs somewhere in the PTX - self.assertIn('ld.param', ptx) + self.assertIn("ld.param", ptx) # Returning the result (through a passed-in array) should also require # a store to global memory, so check for at least one of those too. - self.assertIn('st.global', ptx) + self.assertIn("st.global", ptx) return _test_call_functions -@skip_on_cudasim('Compilation to PTX is not supported on cudasim') +@skip_on_cudasim("Compilation to PTX is not supported on cudasim") class TestLibdeviceCompilation(unittest.TestCase): """ Class for holding all tests of compiling calls to libdevice functions. 
We @@ -179,9 +182,10 @@ class TestLibdeviceCompilation(unittest.TestCase): for libname in functions: - setattr(TestLibdeviceCompilation, 'test_%s' % libname, - make_test_call(libname)) + setattr( + TestLibdeviceCompilation, "test_%s" % libname, make_test_call(libname) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py b/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py index 182873b50..edd7c314d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py @@ -7,16 +7,16 @@ import warnings -@skip_on_cudasim('Simulator does not produce lineinfo') +@skip_on_cudasim("Simulator does not produce lineinfo") class TestCudaLineInfo(CUDATestCase): def _loc_directive_regex(self): # This is used in several tests pat = ( - r'\.loc' # .loc directive beginning - r'\s+[0-9]+' # whitespace then file index - r'\s+[0-9]+' # whitespace then line number - r'\s+[0-9]+' # whitespace then column position + r"\.loc" # .loc directive beginning + r"\s+[0-9]+" # whitespace then file index + r"\s+[0-9]+" # whitespace then line number + r"\s+[0-9]+" # whitespace then column position ) return re.compile(pat) @@ -29,21 +29,21 @@ def _check(self, fn, sig, expect): # DICompileUnit debug info metadata should all be of the # DebugDirectivesOnly kind, and not the FullDebug kind pat = ( - r'!DICompileUnit\(.*' # Opening of DICompileUnit metadata. Since - # the order of attributes is not - # guaranteed, we need to match arbitrarily - # afterwards. - r'emissionKind:\s+' # The emissionKind attribute followed by - # whitespace. - r'DebugDirectivesOnly' # The correct emissionKind. + r"!DICompileUnit\(.*" # Opening of DICompileUnit metadata. Since + # the order of attributes is not + # guaranteed, we need to match arbitrarily + # afterwards. + r"emissionKind:\s+" # The emissionKind attribute followed by + # whitespace. 
+ r"DebugDirectivesOnly" # The correct emissionKind. ) match = re.compile(pat).search(llvm) assertfn(match, msg=ptx) pat = ( - r'!DICompileUnit\(.*' # Same as the pattern above, but for the - r'emissionKind:\s+' # incorrect FullDebug emissionKind. - r'FullDebug' # + r"!DICompileUnit\(.*" # Same as the pattern above, but for the + r"emissionKind:\s+" # incorrect FullDebug emissionKind. + r"FullDebug" # ) match = re.compile(pat).search(llvm) self.assertIsNone(match, msg=ptx) @@ -51,8 +51,8 @@ def _check(self, fn, sig, expect): # The name of this file should be present in the line mapping # if lineinfo was propagated through correctly. pat = ( - r'\.file' # .file directive beginning - r'\s+[0-9]+\s+' # file number surrounded by whitespace + r"\.file" # .file directive beginning + r"\s+[0-9]+\s+" # file number surrounded by whitespace r'".*test_lineinfo.py"' # filename in quotes, ignoring full path ) match = re.compile(pat).search(ptx) @@ -65,8 +65,8 @@ def _check(self, fn, sig, expect): # Debug info sections should not be present when only lineinfo is # generated pat = ( - r'\.section\s+' # .section directive beginning - r'\.debug_info' # Section named ".debug_info" + r"\.section\s+" # .section directive beginning + r"\.debug_info" # Section named ".debug_info" ) match = re.compile(pat).search(ptx) self.assertIsNone(match, msg=ptx) @@ -98,7 +98,7 @@ def divide_kernel(x, y): # signal an exception (e.g. divide by zero) has occurred. When the # error model is the default NumPy one (as it should be when only # lineinfo is enabled) the device function always returns 0. - self.assertNotIn('ret i32 1', llvm) + self.assertNotIn("ret i32 1", llvm) def test_no_lineinfo_in_device_function(self): # Ensure that no lineinfo is generated in device functions by default. 
@@ -138,7 +138,7 @@ def caller(x): # Check that there is no device function in the PTX # A line beginning with ".weak .func" that identifies a device function - devfn_start = re.compile(r'^\.weak\s+\.func') + devfn_start = re.compile(r"^\.weak\s+\.func") for line in ptxlines: if devfn_start.match(line) is not None: @@ -151,13 +151,14 @@ def caller(x): for line in ptxlines: if loc_directive.search(line) is not None: - if 'inlined_at' in line: + if "inlined_at" in line: found = True break if not found: - self.fail(f'No .loc directive with inlined_at info found' - f'in:\n\n{ptx}') + self.fail( + f"No .loc directive with inlined_at info foundin:\n\n{ptx}" + ) # We also inspect the LLVM to ensure that there's debug info for each # subprogram (function). A lightweight way to check this is to ensure @@ -166,7 +167,7 @@ def caller(x): llvm = caller.inspect_llvm(sig) subprograms = 0 for line in llvm.splitlines(): - if 'distinct !DISubprogram' in line: + if "distinct !DISubprogram" in line: subprograms += 1 # One DISubprogram for each of: @@ -174,9 +175,12 @@ def caller(x): # - The callee expected_subprograms = 2 - self.assertEqual(subprograms, expected_subprograms, - f'"Expected {expected_subprograms} DISubprograms; ' - f'got {subprograms}') + self.assertEqual( + subprograms, + expected_subprograms, + f'"Expected {expected_subprograms} DISubprograms; ' + f"got {subprograms}", + ) def test_debug_and_lineinfo_warning(self): with warnings.catch_warnings(record=True) as w: @@ -190,9 +194,10 @@ def f(): self.assertEqual(len(w), 1) self.assertEqual(w[0].category, NumbaInvalidConfigWarning) - self.assertIn('debug and lineinfo are mutually exclusive', - str(w[0].message)) + self.assertIn( + "debug and lineinfo are mutually exclusive", str(w[0].message) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py b/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py index 26b3469a7..1e6687ce6 100644 --- 
a/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py @@ -31,32 +31,31 @@ def culocal1tuple(A, B): B[i] = C[i] -@skip_on_cudasim('PTX inspection not available in cudasim') +@skip_on_cudasim("PTX inspection not available in cudasim") class TestCudaLocalMem(CUDATestCase): def test_local_array(self): sig = (int32[:], int32[:]) jculocal = cuda.jit(sig)(culocal) - self.assertTrue('.local' in jculocal.inspect_asm(sig)) - A = np.arange(1000, dtype='int32') + self.assertTrue(".local" in jculocal.inspect_asm(sig)) + A = np.arange(1000, dtype="int32") B = np.zeros_like(A) jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_1_tuple(self): - """Ensure that local arrays can be constructed with 1-tuple shape - """ - jculocal = cuda.jit('void(int32[:], int32[:])')(culocal1tuple) + """Ensure that local arrays can be constructed with 1-tuple shape""" + jculocal = cuda.jit("void(int32[:], int32[:])")(culocal1tuple) # Don't check if .local is in the ptx because the optimizer # may reduce it to registers. 
- A = np.arange(5, dtype='int32') + A = np.arange(5, dtype="int32") B = np.zeros_like(A) jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_complex(self): - sig = 'void(complex128[:], complex128[:])' + sig = "void(complex128[:], complex128[:])" jculocalcomplex = cuda.jit(sig)(culocalcomplex) - A = (np.arange(100, dtype='complex128') - 1) / 2j + A = (np.arange(100, dtype="complex128") - 1) / 2j B = np.zeros_like(A) jculocalcomplex[1, 1](A, B) self.assertTrue(np.all(A == B)) @@ -64,7 +63,7 @@ def test_local_array_complex(self): def check_dtype(self, f, dtype): # Find the typing of the dtype argument to cuda.local.array annotation = next(iter(f.overloads.values()))._type_annotation - l_dtype = annotation.typemap['l'].dtype + l_dtype = annotation.typemap["l"].dtype # Ensure that the typing is correct self.assertEqual(l_dtype, dtype) @@ -95,7 +94,7 @@ def test_string_dtype(self): # Check that strings can be used to specify the dtype of a local array @cuda.jit(void(int32[::1])) def f(x): - l = cuda.local.array(10, dtype='int32') + l = cuda.local.array(10, dtype="int32") l[0] = x[0] x[0] = l[0] @@ -106,9 +105,10 @@ def test_invalid_string_dtype(self): # Check that strings of invalid dtypes cause a typing error re = ".*Invalid NumPy dtype specified: 'int33'.*" with self.assertRaisesRegex(TypingError, re): + @cuda.jit(void(int32[::1])) def f(x): - l = cuda.local.array(10, dtype='int33') + l = cuda.local.array(10, dtype="int33") l[0] = x[0] x[0] = l[0] @@ -160,5 +160,5 @@ def test_issue_fp16_support(self): self._check_local_array_size_fp16(2, 2, np.float16) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py b/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py index 2b7290ad4..29265bfa5 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py @@ -3,11 +3,10 @@ from numba.cuda.testing import skip_on_cudasim, 
unittest -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCudaMandel(unittest.TestCase): def test_mandel(self): - """Just make sure we can compile this - """ + """Just make sure we can compile this""" def mandel(tid, min_x, max_x, min_y, max_y, width, height, iters): pixel_size_x = (max_x - min_x) / width @@ -28,10 +27,18 @@ def mandel(tid, min_x, max_x, min_y, max_y, width, height, iters): return i return iters - args = (uint32, float64, float64, float64, float64, - uint32, uint32, uint32) + args = ( + uint32, + float64, + float64, + float64, + float64, + uint32, + uint32, + uint32, + ) compile_ptx(mandel, args, device=True) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_math.py b/numba_cuda/numba/cuda/tests/cudapy/test_math.py index 028a402ff..9cc7ff473 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_math.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_math.py @@ -1,8 +1,10 @@ import numpy as np -from numba.cuda.testing import (skip_unless_cc_53, - unittest, - CUDATestCase, - skip_on_cudasim) +from numba.cuda.testing import ( + skip_unless_cc_53, + unittest, + CUDATestCase, + skip_on_cudasim, +) from numba.np import numpy_support from numba import cuda, float32, float64, int32, vectorize, void, int64 import math @@ -253,8 +255,10 @@ def unary_template(self, func, npfunc, npdtype, nprestype, start, stop): def unary_bool_special_values(self, func, npfunc, npdtype, npmtype): fi = np.finfo(npdtype) denorm = fi.tiny / 4 - A = np.array([0., denorm, fi.tiny, 0.5, 1., fi.max, np.inf, np.nan], - dtype=npdtype) + A = np.array( + [0.0, denorm, fi.tiny, 0.5, 1.0, fi.max, np.inf, np.nan], + dtype=npdtype, + ) B = np.empty_like(A, dtype=np.int32) cfunc = cuda.jit((npmtype[::1], int32[::1]))(func) @@ -314,7 +318,7 @@ def binary_template(self, func, npfunc, npdtype, nprestype, start, stop): cfunc[1, nelem](A, A, 
B) np.testing.assert_allclose(npfunc(A, A), B) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_acos def test_math_acos(self): @@ -325,7 +329,7 @@ def test_math_acos(self): self.unary_template_int64(math_acos, np.arccos, start=0, stop=0) self.unary_template_uint64(math_acos, np.arccos, start=0, stop=0) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_asin def test_math_asin(self): @@ -336,7 +340,7 @@ def test_math_asin(self): self.unary_template_int64(math_asin, np.arcsin, start=0, stop=0) self.unary_template_uint64(math_asin, np.arcsin, start=0, stop=0) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_atan def test_math_atan(self): @@ -345,7 +349,7 @@ def test_math_atan(self): self.unary_template_int64(math_atan, np.arctan) self.unary_template_uint64(math_atan, np.arctan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_acosh def test_math_acosh(self): @@ -354,7 +358,7 @@ def test_math_acosh(self): self.unary_template_int64(math_acosh, np.arccosh, start=1, stop=2) self.unary_template_uint64(math_acosh, np.arccosh, start=1, stop=2) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_asinh def test_math_asinh(self): @@ -363,16 +367,16 @@ def test_math_asinh(self): self.unary_template_int64(math_asinh, np.arcsinh) self.unary_template_uint64(math_asinh, np.arcsinh) - #--------------------------------------------------------------------------- + # 
--------------------------------------------------------------------------- # test_math_atanh def test_math_atanh(self): - self.unary_template_float32(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_float64(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_int64(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_uint64(math_atanh, np.arctanh, start=0, stop=.9) + self.unary_template_float32(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_float64(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_int64(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_uint64(math_atanh, np.arctanh, start=0, stop=0.9) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_cos def test_math_cos(self): @@ -399,7 +403,7 @@ def test_math_fp16(self): def test_math_fp16_trunc(self): self.unary_template_float16(math_trunc, np.trunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sin def test_math_sin(self): @@ -408,7 +412,7 @@ def test_math_sin(self): self.unary_template_int64(math_sin, np.sin) self.unary_template_uint64(math_sin, np.sin) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_tan def test_math_tan(self): @@ -417,7 +421,7 @@ def test_math_tan(self): self.unary_template_int64(math_tan, np.tan) self.unary_template_uint64(math_tan, np.tan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_cosh def test_math_cosh(self): @@ -426,7 +430,7 @@ def test_math_cosh(self): self.unary_template_int64(math_cosh, np.cosh) 
self.unary_template_uint64(math_cosh, np.cosh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sinh def test_math_sinh(self): @@ -435,7 +439,7 @@ def test_math_sinh(self): self.unary_template_int64(math_sinh, np.sinh) self.unary_template_uint64(math_sinh, np.sinh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_tanh def test_math_tanh(self): @@ -444,7 +448,7 @@ def test_math_tanh(self): self.unary_template_int64(math_tanh, np.tanh) self.unary_template_uint64(math_tanh, np.tanh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_atan2 def test_math_atan2(self): @@ -453,31 +457,33 @@ def test_math_atan2(self): self.binary_template_int64(math_atan2, np.arctan2) self.binary_template_uint64(math_atan2, np.arctan2) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_erf def test_math_erf(self): @vectorize def ufunc(x): return math.erf(x) + self.unary_template_float32(math_erf, ufunc) self.unary_template_float64(math_erf, ufunc) self.unary_template_int64(math_erf, ufunc) self.unary_template_uint64(math_erf, ufunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_erfc def test_math_erfc(self): @vectorize def ufunc(x): return math.erfc(x) + self.unary_template_float32(math_erfc, ufunc) self.unary_template_float64(math_erfc, ufunc) self.unary_template_int64(math_erfc, ufunc) self.unary_template_uint64(math_erfc, ufunc) - 
#--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_exp def test_math_exp(self): @@ -486,7 +492,7 @@ def test_math_exp(self): self.unary_template_int64(math_exp, np.exp) self.unary_template_uint64(math_exp, np.exp) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_expm1 def test_math_expm1(self): @@ -495,7 +501,7 @@ def test_math_expm1(self): self.unary_template_int64(math_expm1, np.expm1) self.unary_template_uint64(math_expm1, np.expm1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_fabs def test_math_fabs(self): @@ -504,31 +510,33 @@ def test_math_fabs(self): self.unary_template_int64(math_fabs, np.fabs, start=-1) self.unary_template_uint64(math_fabs, np.fabs, start=-1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_gamma def test_math_gamma(self): @vectorize def ufunc(x): return math.gamma(x) + self.unary_template_float32(math_gamma, ufunc, start=0.1) self.unary_template_float64(math_gamma, ufunc, start=0.1) self.unary_template_int64(math_gamma, ufunc, start=1) self.unary_template_uint64(math_gamma, ufunc, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_lgamma def test_math_lgamma(self): @vectorize def ufunc(x): return math.lgamma(x) + self.unary_template_float32(math_lgamma, ufunc, start=0.1) self.unary_template_float64(math_lgamma, ufunc, start=0.1) self.unary_template_int64(math_lgamma, ufunc, start=1) self.unary_template_uint64(math_lgamma, ufunc, 
start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log def test_math_log(self): @@ -537,7 +545,7 @@ def test_math_log(self): self.unary_template_int64(math_log, np.log, start=1) self.unary_template_uint64(math_log, np.log, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log2 def test_math_log2(self): @@ -546,7 +554,7 @@ def test_math_log2(self): self.unary_template_int64(math_log2, np.log2, start=1) self.unary_template_uint64(math_log2, np.log2, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log10 def test_math_log10(self): @@ -555,7 +563,7 @@ def test_math_log10(self): self.unary_template_int64(math_log10, np.log10, start=1) self.unary_template_uint64(math_log10, np.log10, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log1p def test_math_log1p(self): @@ -564,7 +572,7 @@ def test_math_log1p(self): self.unary_template_int64(math_log1p, np.log1p) self.unary_template_uint64(math_log1p, np.log1p) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_remainder def test_math_remainder(self): @@ -573,16 +581,17 @@ def test_math_remainder(self): self.binary_template_int64(math_remainder, np.remainder, start=1) self.binary_template_uint64(math_remainder, np.remainder, start=1) - @skip_on_cudasim('math.remainder(0, 0) raises a ValueError on CUDASim') + @skip_on_cudasim("math.remainder(0, 0) raises a ValueError on CUDASim") def 
test_math_remainder_0_0(self): @cuda.jit(void(float64[::1], int64, int64)) def test_0_0(r, x, y): r[0] = math.remainder(x, y) + r = np.zeros(1, np.float64) test_0_0[1, 1](r, 0, 0) self.assertTrue(np.isnan(r[0])) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sqrt def test_math_sqrt(self): @@ -591,7 +600,7 @@ def test_math_sqrt(self): self.unary_template_int64(math_sqrt, np.sqrt) self.unary_template_uint64(math_sqrt, np.sqrt) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_hypot def test_math_hypot(self): @@ -600,7 +609,7 @@ def test_math_hypot(self): self.binary_template_int64(math_hypot, np.hypot) self.binary_template_uint64(math_hypot, np.hypot) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_pow def pow_template_int32(self, npdtype): @@ -626,14 +635,14 @@ def test_math_pow(self): self.pow_template_int32(np.float32) self.pow_template_int32(np.float64) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_pow_binop def test_math_pow_binop(self): self.binary_template_float32(math_pow_binop, np.power) self.binary_template_float64(math_pow_binop, np.power) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_ceil def test_math_ceil(self): @@ -642,7 +651,7 @@ def test_math_ceil(self): self.unary_template_int64(math_ceil, np.ceil) self.unary_template_uint64(math_ceil, np.ceil) - #--------------------------------------------------------------------------- + # 
--------------------------------------------------------------------------- # test_math_floor def test_math_floor(self): @@ -651,7 +660,7 @@ def test_math_floor(self): self.unary_template_int64(math_floor, np.floor) self.unary_template_uint64(math_floor, np.floor) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_trunc # # Note that math.trunc() is only supported on NumPy float64s, and not @@ -663,20 +672,20 @@ def test_math_floor(self): def test_math_trunc(self): self.unary_template_float64(math_trunc, np.trunc) - @skip_on_cudasim('trunc only supported on NumPy float64') + @skip_on_cudasim("trunc only supported on NumPy float64") def test_math_trunc_non_float64(self): self.unary_template_float32(math_trunc, np.trunc) self.unary_template_int64(math_trunc, np.trunc) self.unary_template_uint64(math_trunc, np.trunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_copysign def test_math_copysign(self): self.binary_template_float32(math_copysign, np.copysign, start=-1) self.binary_template_float64(math_copysign, np.copysign, start=-1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_modf def test_math_modf(self): @@ -696,45 +705,53 @@ def modf_template_compare(A, dtype, arytype): cfunc = cuda.jit((arytype, arytype, arytype))(math_modf) cfunc[1, len(A)](A, B, C) D, E = np.modf(A) - self.assertTrue(np.array_equal(B,D)) - self.assertTrue(np.array_equal(C,E)) + self.assertTrue(np.array_equal(B, D)) + self.assertTrue(np.array_equal(C, E)) nelem = 50 - #32 bit float + # 32 bit float with self.subTest("float32 modf on simple float"): - modf_template_compare(np.linspace(0, 10, nelem), dtype=np.float32, - 
arytype=float32[:]) + modf_template_compare( + np.linspace(0, 10, nelem), dtype=np.float32, arytype=float32[:] + ) with self.subTest("float32 modf on +- infinity"): - modf_template_compare(np.array([np.inf, -np.inf]), dtype=np.float32, - arytype=float32[:]) + modf_template_compare( + np.array([np.inf, -np.inf]), + dtype=np.float32, + arytype=float32[:], + ) with self.subTest("float32 modf on nan"): modf_template_nan(dtype=np.float32, arytype=float32[:]) - #64 bit float + # 64 bit float with self.subTest("float64 modf on simple float"): - modf_template_compare(np.linspace(0, 10, nelem), dtype=np.float64, - arytype=float64[:]) + modf_template_compare( + np.linspace(0, 10, nelem), dtype=np.float64, arytype=float64[:] + ) with self.subTest("float64 modf on +- infinity"): - modf_template_compare(np.array([np.inf, -np.inf]), dtype=np.float64, - arytype=float64[:]) + modf_template_compare( + np.array([np.inf, -np.inf]), + dtype=np.float64, + arytype=float64[:], + ) with self.subTest("float64 modf on nan"): modf_template_nan(dtype=np.float64, arytype=float64[:]) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_fmod def test_math_fmod(self): self.binary_template_float32(math_fmod, np.fmod, start=1) self.binary_template_float64(math_fmod, np.fmod, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_mod_binop def test_math_mod_binop(self): self.binary_template_float32(math_mod_binop, np.fmod, start=1) self.binary_template_float64(math_mod_binop, np.fmod, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isnan def test_math_isnan(self): @@ -745,7 +762,7 @@ def test_math_isnan(self): 
self.unary_bool_special_values_float32(math_isnan, np.isnan) self.unary_bool_special_values_float64(math_isnan, np.isnan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isinf def test_math_isinf(self): @@ -756,7 +773,7 @@ def test_math_isinf(self): self.unary_bool_special_values_float32(math_isinf, np.isinf) self.unary_bool_special_values_float64(math_isinf, np.isinf) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isfinite def test_math_isfinite(self): @@ -767,14 +784,14 @@ def test_math_isfinite(self): self.unary_bool_special_values_float32(math_isfinite, np.isfinite) self.unary_bool_special_values_float64(math_isfinite, np.isfinite) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_degrees def test_math_degrees(self): self.unary_bool_template_float32(math_degrees, np.degrees) self.unary_bool_template_float64(math_degrees, np.degrees) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_radians def test_math_radians(self): @@ -782,5 +799,5 @@ def test_math_radians(self): self.unary_bool_template_float64(math_radians, np.radians) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py b/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py index 51f1181a3..0071287c7 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py @@ -15,9 +15,7 @@ class TestCudaMatMul(CUDATestCase): - def test_func(self): - @cuda.jit(void(float32[:, ::1], float32[:, ::1], 
float32[:, ::1])) def cu_square_matrix_mul(A, B, C): sA = cuda.shared.array(shape=SM_SIZE, dtype=float32) @@ -70,5 +68,5 @@ def cu_square_matrix_mul(A, B, C): np.testing.assert_allclose(C, Cans, rtol=1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py index aee97fd63..c44a2b5e6 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py @@ -22,20 +22,21 @@ def builtin_min(A, B, C): C[i] = float64(min(A[i], B[i])) -@skip_on_cudasim('Tests PTX emission') +@skip_on_cudasim("Tests PTX emission") class TestCudaMinMax(CUDATestCase): def _run( - self, - kernel, - numpy_equivalent, - ptx_instruction, - dtype_left, - dtype_right, - n=5): + self, + kernel, + numpy_equivalent, + ptx_instruction, + dtype_left, + dtype_right, + n=5, + ): kernel = cuda.jit(kernel) c = np.zeros(n, dtype=np.float64) - a = np.arange(n, dtype=dtype_left) + .5 + a = np.arange(n, dtype=dtype_left) + 0.5 b = np.full(n, fill_value=2, dtype=dtype_right) kernel[1, c.shape](a, b, c) @@ -45,69 +46,29 @@ def _run( self.assertIn(ptx_instruction, ptx) def test_max_f8f8(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float64, - np.float64) + self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float64) def test_max_f4f8(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float32, - np.float64) + self._run(builtin_max, np.maximum, "max.f64", np.float32, np.float64) def test_max_f8f4(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float64, - np.float32) + self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float32) def test_max_f4f4(self): - self._run( - builtin_max, - np.maximum, - 'max.f32', - np.float32, - np.float32) + self._run(builtin_max, np.maximum, "max.f32", np.float32, np.float32) def test_min_f8f8(self): - self._run( - builtin_min, - 
np.minimum, - 'min.f64', - np.float64, - np.float64) + self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float64) def test_min_f4f8(self): - self._run( - builtin_min, - np.minimum, - 'min.f64', - np.float32, - np.float64) + self._run(builtin_min, np.minimum, "min.f64", np.float32, np.float64) def test_min_f8f4(self): - self._run( - builtin_min, - np.minimum, - 'min.f64', - np.float64, - np.float32) + self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float32) def test_min_f4f4(self): - self._run( - builtin_min, - np.minimum, - 'min.f32', - np.float32, - np.float32) + self._run(builtin_min, np.minimum, "min.f32", np.float32, np.float32) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py b/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py index 181a80a69..143fa10c6 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py @@ -5,11 +5,11 @@ class TestCudaMonteCarlo(CUDATestCase): def test_montecarlo(self): - """Just make sure we can compile this - """ + """Just make sure we can compile this""" @cuda.jit( - 'void(double[:], double[:], double, double, double, double[:])') + "void(double[:], double[:], double, double, double, double[:])" + ) def step(last, paths, dt, c0, c1, normdist): i = cuda.grid(1) if i >= paths.shape[0]: @@ -18,5 +18,5 @@ def step(last, paths, dt, c0, c1, normdist): paths[i] = last[i] * math.exp(c0 * dt + c1 * noise) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py index 01b8a63ea..700987252 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py @@ -47,7 +47,7 @@ def check(inp, out): copy_plus_1[1, N](A, B) check(A, B) - @skip_on_cudasim('Simulator does not support 
multiple threads') + @skip_on_cudasim("Simulator does not support multiple threads") def test_multithreaded(self): def work(gpu, dA, results, ridx): try: @@ -64,9 +64,12 @@ def work(gpu, dA, results, ridx): nthreads = 10 results = [None] * nthreads - threads = [threading.Thread(target=work, args=(cuda.gpus.current, - dA, results, i)) - for i in range(nthreads)] + threads = [ + threading.Thread( + target=work, args=(cuda.gpus.current, dA, results, i) + ) + for i in range(nthreads) + ] for th in threads: th.start() @@ -81,7 +84,6 @@ def work(gpu, dA, results, ridx): @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") def test_with_context(self): - @cuda.jit def vector_add_scalar(arr, val): i = cuda.grid(1) @@ -115,7 +117,7 @@ def test_with_context_peer_copy(self): with cuda.gpus[0]: ctx = cuda.current_context() if not ctx.can_access_peer(1): - self.skipTest('Peer access between GPUs disabled') + self.skipTest("Peer access between GPUs disabled") # 1. Create a range in an array hostarr = np.arange(10, dtype=np.float32) @@ -136,5 +138,5 @@ def test_with_context_peer_copy(self): np.testing.assert_equal(arr2.copy_to_host(), hostarr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py b/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py index 04a1234b4..4d3fa07ca 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py @@ -7,12 +7,13 @@ from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -has_mp_get_context = hasattr(mp, 'get_context') -is_unix = os.name == 'posix' +has_mp_get_context = hasattr(mp, "get_context") +is_unix = os.name == "posix" def fork_test(q): from numba.cuda.cudadrv.error import CudaDriverError + try: cuda.to_device(np.arange(1)) except CudaDriverError as e: @@ -21,17 +22,17 @@ def fork_test(q): q.put(None) -@skip_on_cudasim('disabled for 
cudasim') +@skip_on_cudasim("disabled for cudasim") class TestMultiprocessing(CUDATestCase): - @unittest.skipUnless(has_mp_get_context, 'requires mp.get_context') - @unittest.skipUnless(is_unix, 'requires Unix') + @unittest.skipUnless(has_mp_get_context, "requires mp.get_context") + @unittest.skipUnless(is_unix, "requires Unix") def test_fork(self): """ Test fork detection. """ cuda.current_context() # force cuda initialize # fork in process that also uses CUDA - ctx = mp.get_context('fork') + ctx = mp.get_context("fork") q = ctx.Queue() proc = ctx.Process(target=fork_test, args=[q]) proc.start() @@ -39,8 +40,8 @@ def test_fork(self): proc.join() # there should be an exception raised in the child process self.assertIsNotNone(exc) - self.assertIn('CUDA initialized before forking', str(exc)) + self.assertIn("CUDA initialized before forking", str(exc)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py index 30afd3eb0..7ca6ff8dd 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py @@ -3,8 +3,11 @@ import multiprocessing import numpy as np from numba import cuda -from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_under_cuda_memcheck, + CUDATestCase, +) import unittest try: @@ -15,7 +18,7 @@ has_concurrent_futures = True -has_mp_get_context = hasattr(multiprocessing, 'get_context') +has_mp_get_context = hasattr(multiprocessing, "get_context") def check_concurrent_compiling(): @@ -41,15 +44,14 @@ def spawn_process_entry(q): # Catch anything that goes wrong in the threads except: # noqa: E722 msg = traceback.format_exc() - q.put('\n'.join(['', '=' * 80, msg])) + q.put("\n".join(["", "=" * 80, msg])) else: q.put(None) -@skip_under_cuda_memcheck('Hangs 
cuda-memcheck') -@skip_on_cudasim('disabled for cudasim') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("disabled for cudasim") class TestMultiThreadCompiling(CUDATestCase): - @unittest.skipIf(not has_concurrent_futures, "no concurrent.futures") def test_concurrent_compiling(self): check_concurrent_compiling() @@ -59,7 +61,7 @@ def test_spawn_concurrent_compilation(self): # force CUDA context init cuda.get_current_device() # use "spawn" to avoid inheriting the CUDA context - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") q = ctx.Queue() p = ctx.Process(target=spawn_process_entry, args=(q,)) @@ -70,7 +72,7 @@ def test_spawn_concurrent_compilation(self): p.join() if err is not None: raise AssertionError(err) - self.assertEqual(p.exitcode, 0, 'test failed in child process') + self.assertEqual(p.exitcode, 0, "test failed in child process") def test_invalid_context_error_with_d2h(self): def d2h(arr, out): @@ -97,5 +99,5 @@ def d2d(dst, src): np.testing.assert_equal(darr.copy_to_host(), arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py b/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py index eaf141052..af57a47ed 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py @@ -45,5 +45,5 @@ def diagproduct(c, a, b): np.testing.assert_array_almost_equal(dF.copy_to_host(), E) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_operator.py b/numba_cuda/numba/cuda/tests/cudapy/test_operator.py index 0547d55fe..5df98b1e2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_operator.py @@ -1,6 +1,10 @@ import numpy as np -from numba.cuda.testing import (unittest, CUDATestCase, skip_unless_cc_53, - skip_on_cudasim) +from numba.cuda.testing 
import ( + unittest, + CUDATestCase, + skip_unless_cc_53, + skip_on_cudasim, +) from numba import cuda from numba.core.types import f2, b1 from numba.cuda import compile_ptx @@ -73,12 +77,12 @@ def simple_fp16_ne(ary, a, b): ary[0] = a != b -@cuda.jit('b1(f2, f2)', device=True) +@cuda.jit("b1(f2, f2)", device=True) def hlt_func_1(x, y): return x < y -@cuda.jit('b1(f2, f2)', device=True) +@cuda.jit("b1(f2, f2)", device=True) def hlt_func_2(x, y): return x < y @@ -116,6 +120,7 @@ def setUp(self): """ Test if operator module is supported by the CUDA target. """ + def operator_template(self, op): @cuda.jit def foo(a, b): @@ -146,8 +151,12 @@ def test_floordiv(self): @skip_unless_cc_53 def test_fp16_binary(self): - functions = (simple_fp16add, simple_fp16sub, simple_fp16mul, - simple_fp16_div_scalar) + functions = ( + simple_fp16add, + simple_fp16sub, + simple_fp16mul, + simple_fp16_div_scalar, + ) ops = (operator.add, operator.sub, operator.mul, operator.truediv) for fn, op in zip(functions, ops): @@ -162,10 +171,10 @@ def test_fp16_binary(self): expected = op(arg1, arg2) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_binary_ptx(self): functions = (simple_fp16add, simple_fp16sub, simple_fp16mul) - instrs = ('add.f16', 'sub.f16', 'mul.f16') + instrs = ("add.f16", "sub.f16", "mul.f16") args = (f2[:], f2, f2) for fn, instr in zip(functions, instrs): with self.subTest(instr=instr): @@ -174,11 +183,14 @@ def test_fp16_binary_ptx(self): @skip_unless_cc_53 def test_mixed_fp16_binary_arithmetic(self): - functions = (simple_fp16add, simple_fp16sub, simple_fp16mul, - simple_fp16_div_scalar) + functions = ( + simple_fp16add, + simple_fp16sub, + simple_fp16mul, + simple_fp16_div_scalar, + ) ops = (operator.add, operator.sub, operator.mul, operator.truediv) - types = (np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64) + types = 
(np.int8, np.int16, np.int32, np.int64, np.float32, np.float64) for (fn, op), ty in itertools.product(zip(functions, ops), types): with self.subTest(op=op, ty=ty): kernel = cuda.jit(fn) @@ -192,10 +204,10 @@ def test_mixed_fp16_binary_arithmetic(self): expected = op(arg1, arg2) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_inplace_binary_ptx(self): functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul) - instrs = ('add.f16', 'sub.f16', 'mul.f16') + instrs = ("add.f16", "sub.f16", "mul.f16") args = (f2[:], f2) for fn, instr in zip(functions, instrs): @@ -205,8 +217,12 @@ def test_fp16_inplace_binary_ptx(self): @skip_unless_cc_53 def test_fp16_inplace_binary(self): - functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul, - simple_fp16_idiv) + functions = ( + simple_fp16_iadd, + simple_fp16_isub, + simple_fp16_imul, + simple_fp16_idiv, + ) ops = (operator.iadd, operator.isub, operator.imul, operator.itruediv) for fn, op in zip(functions, ops): @@ -236,26 +252,37 @@ def test_fp16_unary(self): expected = op(arg1) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_neg_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3)) - self.assertIn('neg.f16', ptx) + self.assertIn("neg.f16", ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_abs_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3)) - self.assertIn('abs.f16', ptx) + self.assertIn("abs.f16", ptx) @skip_unless_cc_53 def test_fp16_comparison(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, 
operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) for fn, op in zip(functions, ops): with self.subTest(op=op): @@ -271,16 +298,25 @@ def test_fp16_comparison(self): @skip_unless_cc_53 def test_mixed_fp16_comparison(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - types = (np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64) - - for (fn, op), ty in itertools.product(zip(functions, ops), - types): + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + types = (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64) + + for (fn, op), ty in itertools.product(zip(functions, ops), types): with self.subTest(op=op, ty=ty): kernel = cuda.jit(fn) @@ -294,48 +330,68 @@ def test_mixed_fp16_comparison(self): @skip_unless_cc_53 def test_multiple_float16_comparisons(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) 
+ arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertTrue(ary[0]) @skip_unless_cc_53 def test_multiple_float16_comparisons_false(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(1.) + arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(1.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertFalse(ary[0]) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_comparison_ptx(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - opstring = ('setp.gt.f16', 'setp.ge.f16', - 'setp.lt.f16', 'setp.le.f16', - 'setp.eq.f16', 'setp.ne.f16') + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + opstring = ( + "setp.gt.f16", + "setp.ge.f16", + "setp.lt.f16", + "setp.le.f16", + "setp.eq.f16", + "setp.ne.f16", + ) args = (b1[:], f2, f2) for fn, op, s in zip(functions, ops, opstring): @@ -343,51 +399,79 @@ def test_fp16_comparison_ptx(self): ptx, _ = compile_ptx(fn, args, cc=(5, 3)) self.assertIn(s, ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def 
test_fp16_int8_comparison_ptx(self): # Test that int8 can be safely converted to fp16 # in a comparison - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - - opstring = {operator.gt:'setp.gt.f16', - operator.ge:'setp.ge.f16', - operator.lt:'setp.lt.f16', - operator.le:'setp.le.f16', - operator.eq:'setp.eq.f16', - operator.ne:'setp.ne.f16'} + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + + opstring = { + operator.gt: "setp.gt.f16", + operator.ge: "setp.ge.f16", + operator.lt: "setp.lt.f16", + operator.le: "setp.le.f16", + operator.eq: "setp.eq.f16", + operator.ne: "setp.ne.f16", + } for fn, op in zip(functions, ops): with self.subTest(op=op): args = (b1[:], f2, from_dtype(np.int8)) ptx, _ = compile_ptx(fn, args, cc=(5, 3)) self.assertIn(opstring[op], ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_mixed_fp16_comparison_promotion_ptx(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - - types_promote = (np.int16, np.int32, np.int64, - np.float32, np.float64) - opstring = {operator.gt:'setp.gt.', - operator.ge:'setp.ge.', - operator.lt:'setp.lt.', - operator.le:'setp.le.', - operator.eq:'setp.eq.', - operator.ne:'setp.neu.'} - opsuffix = {np.dtype('int32'): 'f64', - np.dtype('int64'): 'f64', - np.dtype('float32'): 'f32', - np.dtype('float64'): 'f64'} - - for (fn, op), ty in itertools.product(zip(functions, ops), - types_promote): + functions = ( + simple_fp16_gt, + simple_fp16_ge, + 
simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + + types_promote = (np.int16, np.int32, np.int64, np.float32, np.float64) + opstring = { + operator.gt: "setp.gt.", + operator.ge: "setp.ge.", + operator.lt: "setp.lt.", + operator.le: "setp.le.", + operator.eq: "setp.eq.", + operator.ne: "setp.neu.", + } + opsuffix = { + np.dtype("int32"): "f64", + np.dtype("int64"): "f64", + np.dtype("float32"): "f32", + np.dtype("float64"): "f64", + } + + for (fn, op), ty in itertools.product( + zip(functions, ops), types_promote + ): with self.subTest(op=op, ty=ty): arg2_ty = np.result_type(np.float16, ty) args = (b1[:], f2, from_dtype(arg2_ty)) @@ -397,5 +481,5 @@ def test_mixed_fp16_comparison_promotion_ptx(self): self.assertIn(ops, ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py b/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py index 27399727b..200ec5264 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py @@ -18,10 +18,10 @@ def device_func(x, y, z): # the test function were more complex it may be possible to isolate additional # fragments of PTX we could check for the absence / presence of, but removal of # the use of local memory is a good indicator that optimization was applied. 
-removed_by_opt = ( '__local_depot0',) +removed_by_opt = ("__local_depot0",) -@skip_on_cudasim('Simulator does not optimize code') +@skip_on_cudasim("Simulator does not optimize code") class TestOptimization(CUDATestCase): def test_eager_opt(self): # Optimization should occur by default @@ -74,7 +74,7 @@ def test_device_opt(self): sig = (float64, float64, float64) device = cuda.jit(sig, device=True)(device_func) ptx = device.inspect_asm(sig) - self.assertIn('fma.rn.f64', ptx) + self.assertIn("fma.rn.f64", ptx) def test_device_noopt(self): # Optimization disabled @@ -82,8 +82,8 @@ def test_device_noopt(self): device = cuda.jit(sig, device=True, opt=False)(device_func) ptx = device.inspect_asm(sig) # Fused-multiply adds should be disabled when not optimizing - self.assertNotIn('fma.rn.f64', ptx) + self.assertNotIn("fma.rn.f64", ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_overload.py b/numba_cuda/numba/cuda/tests/cudapy/test_overload.py index 746ea3f4a..51752f732 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_overload.py @@ -8,6 +8,7 @@ # Dummy function definitions to overload + def generic_func_1(): pass @@ -83,109 +84,124 @@ def default_values_and_kwargs(): # Overload implementations -@overload(generic_func_1, target='generic') + +@overload(generic_func_1, target="generic") def ol_generic_func_1(x): def impl(x): x[0] *= GENERIC_FUNCTION_1 + return impl -@overload(cuda_func_1, target='cuda') +@overload(cuda_func_1, target="cuda") def ol_cuda_func_1(x): def impl(x): x[0] *= CUDA_FUNCTION_1 + return impl -@overload(generic_func_2, target='generic') +@overload(generic_func_2, target="generic") def ol_generic_func_2(x): def impl(x): x[0] *= GENERIC_FUNCTION_2 + return impl -@overload(cuda_func_2, target='cuda') +@overload(cuda_func_2, target="cuda") def ol_cuda_func(x): def impl(x): x[0] *= CUDA_FUNCTION_2 + return impl 
-@overload(generic_calls_generic, target='generic') +@overload(generic_calls_generic, target="generic") def ol_generic_calls_generic(x): def impl(x): x[0] *= GENERIC_CALLS_GENERIC generic_func_1(x) + return impl -@overload(generic_calls_cuda, target='generic') +@overload(generic_calls_cuda, target="generic") def ol_generic_calls_cuda(x): def impl(x): x[0] *= GENERIC_CALLS_CUDA cuda_func_1(x) + return impl -@overload(cuda_calls_generic, target='cuda') +@overload(cuda_calls_generic, target="cuda") def ol_cuda_calls_generic(x): def impl(x): x[0] *= CUDA_CALLS_GENERIC generic_func_1(x) + return impl -@overload(cuda_calls_cuda, target='cuda') +@overload(cuda_calls_cuda, target="cuda") def ol_cuda_calls_cuda(x): def impl(x): x[0] *= CUDA_CALLS_CUDA cuda_func_1(x) + return impl -@overload(target_overloaded, target='generic') +@overload(target_overloaded, target="generic") def ol_target_overloaded_generic(x): def impl(x): x[0] *= GENERIC_TARGET_OL + return impl -@overload(target_overloaded, target='cuda') +@overload(target_overloaded, target="cuda") def ol_target_overloaded_cuda(x): def impl(x): x[0] *= CUDA_TARGET_OL + return impl -@overload(generic_calls_target_overloaded, target='generic') +@overload(generic_calls_target_overloaded, target="generic") def ol_generic_calls_target_overloaded(x): def impl(x): x[0] *= GENERIC_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(cuda_calls_target_overloaded, target='cuda') +@overload(cuda_calls_target_overloaded, target="cuda") def ol_cuda_calls_target_overloaded(x): def impl(x): x[0] *= CUDA_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(target_overloaded_calls_target_overloaded, target='generic') +@overload(target_overloaded_calls_target_overloaded, target="generic") def ol_generic_calls_target_overloaded_generic(x): def impl(x): x[0] *= GENERIC_TARGET_OL_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(target_overloaded_calls_target_overloaded, target='cuda') 
+@overload(target_overloaded_calls_target_overloaded, target="cuda") def ol_generic_calls_target_overloaded_cuda(x): def impl(x): x[0] *= CUDA_TARGET_OL_CALLS_TARGET_OL target_overloaded(x) + return impl @@ -193,10 +209,11 @@ def impl(x): def ol_default_values_and_kwargs(out, x, y=5, z=6): def impl(out, x, y=5, z=6): out[0], out[1] = x + y, z + return impl -@skip_on_cudasim('Overloading not supported in cudasim') +@skip_on_cudasim("Overloading not supported in cudasim") class TestOverload(CUDATestCase): def check_overload(self, kernel, expected): x = np.ones(1, dtype=np.int32) @@ -311,7 +328,7 @@ def test_overload_attribute_target(self): MyDummy, MyDummyType = self.make_dummy_type() mydummy_type = typeof(MyDummy()) - @overload_attribute(MyDummyType, 'cuda_only', target='cuda') + @overload_attribute(MyDummyType, "cuda_only", target="cuda") def ov_dummy_cuda_attr(obj): def imp(obj): return 42 @@ -330,6 +347,7 @@ def imp(obj): msg = "Unknown attribute 'cuda_only'" with self.assertRaisesRegex(TypingError, msg): + @njit(types.int64(mydummy_type)) def illegal_target_attr_use(x): return x.cuda_only @@ -345,14 +363,15 @@ def test_default_values_and_kwargs(self): """ Test default values and kwargs. 
""" + @cuda.jit() def kernel(a, b, out): default_values_and_kwargs(out, a, z=b) out = np.empty(2, dtype=np.int64) - kernel[1,1](1, 2, out) + kernel[1, 1](1, 2, out) self.assertEqual(tuple(out), (6, 2)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py index 1932b3165..331a4b25c 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py @@ -47,7 +47,7 @@ def vec_pow_inplace_binop(r, x): def random_complex(N): np.random.seed(123) - return (np.random.random(1) + np.random.random(1) * 1j) + return np.random.random(1) + np.random.random(1) * 1j class TestCudaPowi(CUDATestCase): @@ -59,7 +59,7 @@ def test_powi(self): A = np.arange(10, dtype=np.float64).reshape(2, 5) Aout = np.empty_like(A) kernel[1, A.shape](A, power, Aout) - self.assertTrue(np.allclose(Aout, A ** power)) + self.assertTrue(np.allclose(Aout, A**power)) def test_powi_binop(self): dec = cuda.jit(void(float64[:, :], int8, float64[:, :])) @@ -69,7 +69,7 @@ def test_powi_binop(self): A = np.arange(10, dtype=np.float64).reshape(2, 5) Aout = np.empty_like(A) kernel[1, A.shape](A, power, Aout) - self.assertTrue(np.allclose(Aout, A ** power)) + self.assertTrue(np.allclose(Aout, A**power)) # Relative tolerance kwarg is provided because 1.0e-7 (the default for # assert_allclose) is a bit tight for single precision. 
@@ -81,7 +81,7 @@ def _test_cpow(self, dtype, func, rtol=1.0e-7): cfunc = cuda.jit(func) cfunc[1, N](r, x, y) - np.testing.assert_allclose(r, x ** y, rtol=rtol) + np.testing.assert_allclose(r, x**y, rtol=rtol) # Checks special cases x = np.asarray([0.0j, 1.0j], dtype=dtype) @@ -89,7 +89,7 @@ def _test_cpow(self, dtype, func, rtol=1.0e-7): r = np.zeros_like(x) cfunc[1, 2](r, x, y) - np.testing.assert_allclose(r, x ** y, rtol=rtol) + np.testing.assert_allclose(r, x**y, rtol=rtol) def test_cpow_complex64_pow(self): self._test_cpow(np.complex64, vec_pow, rtol=3.0e-7) @@ -107,7 +107,7 @@ def _test_cpow_inplace_binop(self, dtype, rtol=1.0e-7): N = 32 x = random_complex(N).astype(dtype) y = random_complex(N).astype(dtype) - r = x ** y + r = x**y cfunc = cuda.jit(vec_pow_inplace_binop) cfunc[1, N](x, y) @@ -120,5 +120,5 @@ def test_cpow_complex128_inplace_binop(self): self._test_cpow_inplace_binop(np.complex128, rtol=3.0e-7) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 30328ead4..0dbb3139b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -113,7 +113,7 @@ def run_code(self, code): def test_cuhello(self): output, _ = self.run_code(cuhello_usecase) actual = [line.strip() for line in output.splitlines()] - expected = ['-42'] * 6 + ['%d 999' % i for i in range(6)] + expected = ["-42"] * 6 + ["%d 999" % i for i in range(6)] # The output of GPU threads is intermingled, but each print() # call is still atomic self.assertEqual(sorted(actual), expected) @@ -136,7 +136,7 @@ def test_printempty(self): def test_string(self): output, _ = self.run_code(printstring_usecase) lines = [line.strip() for line in output.splitlines(True)] - expected = ['%d hop! 999' % i for i in range(3)] + expected = ["%d hop! 
999" % i for i in range(3)] self.assertEqual(sorted(lines), expected) def test_dim3(self): @@ -145,7 +145,7 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) - @skip_on_cudasim('cudasim can print unlimited output') + @skip_on_cudasim("cudasim can print unlimited output") def test_too_many_args(self): # Tests that we emit the format string and warn when there are more # than 32 arguments, in common with CUDA C/C++ printf - this is due to @@ -155,14 +155,16 @@ def test_too_many_args(self): output, errors = self.run_code(print_too_many_usecase) # Check that the format string was printed instead of formatted garbage - expected_fmt_string = ' '.join(['%lld' for _ in range(33)]) + expected_fmt_string = " ".join(["%lld" for _ in range(33)]) self.assertIn(expected_fmt_string, output) # Check for the expected warning about formatting more than 32 items - warn_msg = ('CUDA print() cannot print more than 32 items. The raw ' - 'format string will be emitted by the kernel instead.') + warn_msg = ( + "CUDA print() cannot print more than 32 items. The raw " + "format string will be emitted by the kernel instead." 
+ ) self.assertIn(warn_msg, errors) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py b/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py index 298a5b747..8ee4b786d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py @@ -29,5 +29,5 @@ def preCalc(y, yA, yB, numDataPoints): self.assertTrue(np.all(y == yB)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_random.py b/numba_cuda/numba/cuda/tests/cudapy/test_random.py index 11bbf95aa..feffb840e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_random.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_random.py @@ -6,9 +6,12 @@ from numba.cuda.testing import unittest from numba.cuda.testing import skip_on_cudasim, CUDATestCase -from numba.cuda.random import \ - xoroshiro128p_uniform_float32, xoroshiro128p_normal_float32, \ - xoroshiro128p_uniform_float64, xoroshiro128p_normal_float64 +from numba.cuda.random import ( + xoroshiro128p_uniform_float32, + xoroshiro128p_normal_float32, + xoroshiro128p_uniform_float64, + xoroshiro128p_normal_float64, +) # Distributions @@ -52,8 +55,9 @@ def test_create_subsequence_start(self): states = cuda.random.create_xoroshiro128p_states(10, seed=1) s1 = states.copy_to_host() - states = cuda.random.create_xoroshiro128p_states(10, seed=1, - subsequence_start=3) + states = cuda.random.create_xoroshiro128p_states( + 10, seed=1, subsequence_start=3 + ) s2 = states.copy_to_host() # Starting seeds should match up with offset of 3 @@ -61,8 +65,9 @@ def test_create_subsequence_start(self): def test_create_stream(self): stream = cuda.stream() - states = cuda.random.create_xoroshiro128p_states(10, seed=1, - stream=stream) + states = cuda.random.create_xoroshiro128p_states( + 10, seed=1, stream=stream + ) s = states.copy_to_host() 
self.assertEqual(len(np.unique(s)), 10) @@ -79,7 +84,7 @@ def check_uniform(self, kernel_func, dtype): def test_uniform_float32(self): self.check_uniform(rng_kernel_float32, np.float32) - @skip_on_cudasim('skip test for speed under cudasim') + @skip_on_cudasim("skip test for speed under cudasim") def test_uniform_float64(self): self.check_uniform(rng_kernel_float64, np.float64) @@ -95,10 +100,10 @@ def check_normal(self, kernel_func, dtype): def test_normal_float32(self): self.check_normal(rng_kernel_float32, np.float32) - @skip_on_cudasim('skip test for speed under cudasim') + @skip_on_cudasim("skip test for speed under cudasim") def test_normal_float64(self): self.check_normal(rng_kernel_float64, np.float64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py b/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py index 75651488e..85ddf1d74 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py @@ -78,24 +78,17 @@ def record_read_2d_array(r, a): recordtype = np.dtype( [ - ('a', np.float64), - ('b', np.int32), - ('c', np.complex64), - ('d', (np.uint8, 5)) + ("a", np.float64), + ("b", np.int32), + ("c", np.complex64), + ("d", (np.uint8, 5)), ], - align=True + align=True, ) -recordwitharray = np.dtype( - [ - ('g', np.int32), - ('h', np.float32, 2) - ], - align=True -) +recordwitharray = np.dtype([("g", np.int32), ("h", np.float32, 2)], align=True) -recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) nested_array1_dtype = np.dtype([("array1", np.int16, (3,))], align=True) @@ -104,12 +97,13 @@ def record_read_2d_array(r, a): # Functions used for "full array" tests + def record_write_full_array(rec): rec.j[:, :] = np.ones((3, 2)) def record_write_full_array_alt(rec): - rec['j'][:, :] = np.ones((3, 2)) + 
rec["j"][:, :] = np.ones((3, 2)) def recarray_set_record(ary, rec): @@ -122,7 +116,7 @@ def recarray_write_array_of_nestedarray_broadcast(ary): def record_setitem_array(rec_source, rec_dest): - rec_dest['j'] = rec_source['j'] + rec_dest["j"] = rec_source["j"] def recarray_write_array_of_nestedarray(ary): @@ -135,7 +129,7 @@ def recarray_getitem_return(ary): def recarray_getitem_field_return(ary): - return ary['h'] + return ary["h"] def recarray_getitem_field_return2(ary): @@ -171,15 +165,14 @@ def record_read_2d_array01(ary): def assign_array_to_nested(dest, src): - dest['array1'] = src + dest["array1"] = src def assign_array_to_nested_2d(dest, src): - dest['array2'] = src + dest["array2"] = src class TestRecordDtype(CUDATestCase): - def _createSampleArrays(self): self.sample1d = np.recarray(3, dtype=recordtype) self.samplerec1darr = np.recarray(1, dtype=recordwitharray)[0] @@ -192,10 +185,10 @@ def setUp(self): ary = self.sample1d for i in range(ary.size): x = i + 1 - ary[i]['a'] = x / 2 - ary[i]['b'] = x - ary[i]['c'] = x * 1j - ary[i]['d'] = "%d" % x + ary[i]["a"] = x / 2 + ary[i]["b"] = x + ary[i]["c"] = x * 1j + ary[i]["d"] = "%d" % x def get_cfunc(self, pyfunc, argspec): return cuda.jit()(pyfunc) @@ -221,7 +214,7 @@ def _test_set_equal(self, pyfunc, value, valuetype): def test_set_a(self): self._test_set_equal(set_a, 3.1415, types.float64) # Test again to check if coercion works - self._test_set_equal(set_a, 3., types.float32) + self._test_set_equal(set_a, 3.0, types.float32) def test_set_b(self): self._test_set_equal(set_b, 123, types.int32) @@ -259,13 +252,13 @@ def _test_rec_set(self, v, pyfunc, f): np.testing.assert_equal(rec[f], v) def test_rec_set_a(self): - self._test_rec_set(np.float64(1.5), record_set_a, 'a') + self._test_rec_set(np.float64(1.5), record_set_a, "a") def test_rec_set_b(self): - self._test_rec_set(np.int32(2), record_set_b, 'b') + self._test_rec_set(np.int32(2), record_set_b, "b") def test_rec_set_c(self): - 
self._test_rec_set(np.complex64(4.0 + 5.0j), record_set_c, 'c') + self._test_rec_set(np.complex64(4.0 + 5.0j), record_set_c, "c") def _test_rec_read(self, v, pyfunc, f): rec = self.sample1d.copy()[0] @@ -277,81 +270,83 @@ def _test_rec_read(self, v, pyfunc, f): np.testing.assert_equal(arr[0], v) def test_rec_read_a(self): - self._test_rec_read(np.float64(1.5), record_read_a, 'a') + self._test_rec_read(np.float64(1.5), record_read_a, "a") def test_rec_read_b(self): - self._test_rec_read(np.int32(2), record_read_b, 'b') + self._test_rec_read(np.int32(2), record_read_b, "b") def test_rec_read_c(self): - self._test_rec_read(np.complex64(4.0 + 5.0j), record_read_c, 'c') + self._test_rec_read(np.complex64(4.0 + 5.0j), record_read_c, "c") def test_record_write_1d_array(self): - ''' + """ Test writing to a 1D array within a structured type - ''' + """ rec = self.samplerec1darr.copy() nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_write_array, (nbrecord,)) cfunc[1, 1](rec) expected = self.samplerec1darr.copy() - expected['g'] = 2 - expected['h'][0] = 3.0 - expected['h'][1] = 4.0 + expected["g"] = 2 + expected["h"][0] = 3.0 + expected["h"][1] = 4.0 np.testing.assert_equal(expected, rec) def test_record_write_2d_array(self): - ''' + """ Test writing to a 2D array within a structured type - ''' + """ rec = self.samplerec2darr.copy() nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_write_2d_array, (nbrecord,)) cfunc[1, 1](rec) expected = self.samplerec2darr.copy() - expected['i'] = 3 - expected['j'][:] = np.asarray([5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - np.float32).reshape(3, 2) + expected["i"] = 3 + expected["j"][:] = np.asarray( + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], np.float32 + ).reshape(3, 2) np.testing.assert_equal(expected, rec) def test_record_read_1d_array(self): - ''' + """ Test reading from a 1D array within a structured type - ''' + """ rec = self.samplerec1darr.copy() - rec['h'][0] = 4.0 - rec['h'][1] 
= 5.0 + rec["h"][0] = 4.0 + rec["h"][1] = 5.0 nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_read_array, (nbrecord,)) - arr = np.zeros(2, dtype=rec['h'].dtype) + arr = np.zeros(2, dtype=rec["h"].dtype) cfunc[1, 1](rec, arr) - np.testing.assert_equal(rec['h'], arr) + np.testing.assert_equal(rec["h"], arr) def test_record_read_2d_array(self): - ''' + """ Test reading from a 2D array within a structured type - ''' + """ rec = self.samplerec2darr.copy() - rec['j'][:] = np.asarray([5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - np.float32).reshape(3, 2) + rec["j"][:] = np.asarray( + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], np.float32 + ).reshape(3, 2) nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_read_2d_array, (nbrecord,)) - arr = np.zeros((3,2), dtype=rec['j'].dtype) + arr = np.zeros((3, 2), dtype=rec["j"].dtype) cfunc[1, 1](rec, arr) - np.testing.assert_equal(rec['j'], arr) + np.testing.assert_equal(rec["j"], arr) -@skip_on_cudasim('Structured array attr access not supported in simulator') +@skip_on_cudasim("Structured array attr access not supported in simulator") class TestRecordDtypeWithStructArrays(TestRecordDtype): - ''' + """ Same as TestRecordDtype, but using structured arrays instead of recarrays. 
- ''' + """ def _createSampleArrays(self): self.sample1d = np.zeros(3, dtype=recordtype) @@ -360,7 +355,6 @@ def _createSampleArrays(self): class TestNestedArrays(CUDATestCase): - # These tests mirror those from # numba.tests.test_record_dtype.TestNestedArrays added in PR # #7359: https://github.com/numba/numba/pull/7359 @@ -405,8 +399,9 @@ def test_record_read_array(self): def test_record_read_2d_array(self): # Test reading from a 2D array within a structured type nbval = np.recarray(1, dtype=recordwith2darray) - nbval[0].j = np.asarray([1.5, 2.5, 3.5, 4.5, 5.5, 6.5], - np.float32).reshape(3, 2) + nbval[0].j = np.asarray( + [1.5, 2.5, 3.5, 4.5, 5.5, 6.5], np.float32 + ).reshape(3, 2) cfunc = self.get_cfunc(record_read_2d_array00, np.float32) res = cfunc(nbval[0]) np.testing.assert_equal(res, nbval[0].j[0, 0]) @@ -422,12 +417,15 @@ def test_record_read_2d_array(self): def test_setitem(self): def gen(): nbarr1 = np.recarray(1, dtype=recordwith2darray) - nbarr1[0] = np.array([(1, ((1, 2), (4, 5), (2, 3)))], - dtype=recordwith2darray)[0] + nbarr1[0] = np.array( + [(1, ((1, 2), (4, 5), (2, 3)))], dtype=recordwith2darray + )[0] nbarr2 = np.recarray(1, dtype=recordwith2darray) - nbarr2[0] = np.array([(10, ((10, 20), (40, 50), (20, 30)))], - dtype=recordwith2darray)[0] + nbarr2[0] = np.array( + [(10, ((10, 20), (40, 50), (20, 30)))], dtype=recordwith2darray + )[0] return nbarr1[0], nbarr2[0] + pyfunc = record_setitem_array pyargs = gen() pyfunc(*pyargs) @@ -453,7 +451,7 @@ def test_getitem_idx(self): # Writing to records / recarrays - @skip_on_cudasim('Structured array attr access not supported in simulator') + @skip_on_cudasim("Structured array attr access not supported in simulator") def test_set_record(self): # Test setting an entire record rec = np.ones(2, dtype=recordwith2darray).view(np.recarray)[0] @@ -492,20 +490,18 @@ def test_assign_array_to_nested_2d(self): np.testing.assert_array_equal(expected, got) def test_issue_7693(self): - src_dtype = np.dtype([ - 
("user", np.float64), - ("array", np.int16, (3,))], - align=True) + src_dtype = np.dtype( + [("user", np.float64), ("array", np.int16, (3,))], align=True + ) - dest_dtype = np.dtype([ - ("user1", np.float64), - ("array1", np.int16, (3,))], - align=True) + dest_dtype = np.dtype( + [("user1", np.float64), ("array1", np.int16, (3,))], align=True + ) @cuda.jit def copy(index, src, dest): - dest['user1'] = src[index]['user'] - dest['array1'] = src[index]['array'] + dest["user1"] = src[index]["user"] + dest["array1"] = src[index]["array"] source = np.zeros(2, dtype=src_dtype) got = np.zeros(2, dtype=dest_dtype) @@ -528,10 +524,13 @@ def test_getitem_idx_2darray(self): # This test returning a record when passing an array and # return the first item when passing a record nbarr = np.recarray(2, dtype=recordwith2darray) - nbarr[0] = np.array([(1, ((1,2),(4,5),(2,3)))], - dtype=recordwith2darray)[0] - for arg, retty in [(nbarr, recordwith2darray), - (nbarr[0], (np.float32, (3, 2)))]: + nbarr[0] = np.array( + [(1, ((1, 2), (4, 5), (2, 3)))], dtype=recordwith2darray + )[0] + for arg, retty in [ + (nbarr, recordwith2darray), + (nbarr[0], (np.float32, (3, 2))), + ]: pyfunc = recarray_getitem_field_return2_2d arr_expected = pyfunc(arg) cfunc = self.get_cfunc(pyfunc, retty) @@ -545,10 +544,12 @@ def test_return_getattr_getitem_fieldname(self): # This tests returning a array of nestedarrays when passing an array and # returning a nestedarray when passing a record nbarr = np.recarray(2, dtype=recordwitharray) - nbarr[0] = np.array([(1, (2,3))], dtype=recordwitharray)[0] + nbarr[0] = np.array([(1, (2, 3))], dtype=recordwitharray)[0] for arg, retty in [(nbarr, recordwitharray), (nbarr[0], np.float32)]: - for pyfunc in [recarray_getitem_field_return, - recarray_getitem_field_return2]: + for pyfunc in [ + recarray_getitem_field_return, + recarray_getitem_field_return2, + ]: arr_expected = pyfunc(arg) cfunc = self.get_cfunc(pyfunc, retty) arr_res = cfunc(arg) @@ -570,17 +571,17 @@ def 
test_record_read_arrays(self): def test_return_array(self): # Test getitem record AND array within record and returning it nbval = np.recarray(2, dtype=recordwitharray) - nbval[0] = np.array([(1, (2,3))], dtype=recordwitharray)[0] + nbval[0] = np.array([(1, (2, 3))], dtype=recordwitharray)[0] pyfunc = record_read_array0 arr_expected = pyfunc(nbval) cfunc = self.get_cfunc(pyfunc, np.float32) arr_res = cfunc(nbval) np.testing.assert_equal(arr_expected, arr_res) - @skip_on_cudasim('Will unexpectedly pass on cudasim') + @skip_on_cudasim("Will unexpectedly pass on cudasim") @unittest.expectedFailure def test_set_array(self): - #Test setting an entire array within one record + # Test setting an entire array within one record arr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) rec = arr[0] nbarr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) @@ -597,8 +598,8 @@ def test_set_arrays(self): arr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) nbarr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) for pyfunc in ( - recarray_write_array_of_nestedarray_broadcast, - recarray_write_array_of_nestedarray, + recarray_write_array_of_nestedarray_broadcast, + recarray_write_array_of_nestedarray, ): arr_expected = pyfunc(arr) cfunc = self.get_cfunc(pyfunc, nbarr.dtype) @@ -606,5 +607,5 @@ def test_set_arrays(self): np.testing.assert_equal(arr_res, arr_expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py b/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py index 579275330..b73722e44 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py @@ -6,11 +6,11 @@ class TestSelfRecursion(CUDATestCase): - def setUp(self): # Avoid importing this module at the top level, as it triggers # compilation and can therefore fail from numba.cuda.tests.cudapy import recursion_usecases + self.mod = 
recursion_usecases super().setUp() @@ -36,19 +36,20 @@ def test_inner_explicit_sig(self): def test_global_implicit_sig(self): self.check_fib(self.mod.fib3) - @skip_on_cudasim('Simulator does not compile') + @skip_on_cudasim("Simulator does not compile") def test_runaway(self): with self.assertRaises(TypingError) as raises: cfunc = self.mod.runaway_self - @cuda.jit('void()') + @cuda.jit("void()") def kernel(): cfunc(1) - self.assertIn("cannot type infer runaway recursion", - str(raises.exception)) + self.assertIn( + "cannot type infer runaway recursion", str(raises.exception) + ) - @unittest.skip('Needs insert_unresolved_ref support in target') + @unittest.skip("Needs insert_unresolved_ref support in target") def test_type_change(self): pfunc = self.mod.type_change_self.py_func cfunc = self.mod.type_change_self @@ -79,7 +80,7 @@ def test_raise(self): self.assertEqual(str(raises.exception), "raise_self") - @unittest.skip('Needs insert_unresolved_ref support in target') + @unittest.skip("Needs insert_unresolved_ref support in target") def test_optional_return(self): pfunc = self.mod.make_optional_return_case() cfunc = self.mod.make_optional_return_case(cuda.jit) @@ -106,12 +107,13 @@ def cpu_kernel(x): self.assertEqual(expected, actual) - @skip_on_cudasim('Recursion handled because simulator does not compile') + @skip_on_cudasim("Recursion handled because simulator does not compile") def test_growing_return_tuple(self): cfunc = self.mod.make_growing_tuple_case(cuda.jit) with self.assertRaises(TypingError) as raises: - @cuda.jit('void()') + + @cuda.jit("void()") def kernel(): cfunc(100) @@ -121,5 +123,5 @@ def kernel(): ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py b/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py index 420fc7516..cd34b018b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py @@ -10,7 
+10,7 @@ class TestReduction(CUDATestCase): def _sum_reduce(self, n): - A = (np.arange(n, dtype=np.float64) + 1) + A = np.arange(n, dtype=np.float64) + 1 expect = A.sum() got = sum_reduce(A) self.assertEqual(expect, got) @@ -19,24 +19,39 @@ def test_sum_reduce(self): if ENABLE_CUDASIM: # Minimal test set for the simulator (which only wraps # functools.reduce) - test_sizes = [ 1, 16 ] + test_sizes = [1, 16] else: # Tests around the points where blocksize changes, and around larger # powers of two, sums of powers of two, and some "random" sizes - test_sizes = [ 1, 15, 16, 17, 127, 128, 129, 1023, 1024, - 1025, 1536, 1048576, 1049600, 1049728, 34567 ] + test_sizes = [ + 1, + 15, + 16, + 17, + 127, + 128, + 129, + 1023, + 1024, + 1025, + 1536, + 1048576, + 1049600, + 1049728, + 34567, + ] # Avoid recompilation by keeping sum_reduce here for n in test_sizes: self._sum_reduce(n) def test_empty_array_host(self): - A = (np.arange(0, dtype=np.float64) + 1) + A = np.arange(0, dtype=np.float64) + 1 expect = A.sum() got = sum_reduce(A) self.assertEqual(expect, got) def test_empty_array_device(self): - A = (np.arange(0, dtype=np.float64) + 1) + A = np.arange(0, dtype=np.float64) + 1 dA = cuda.to_device(A) expect = A.sum() got = sum_reduce(dA) @@ -44,27 +59,27 @@ def test_empty_array_device(self): def test_prod_reduce(self): prod_reduce = cuda.reduce(lambda a, b: a * b) - A = (np.arange(64, dtype=np.float64) + 1) + A = np.arange(64, dtype=np.float64) + 1 expect = A.prod() got = prod_reduce(A, init=1) np.testing.assert_allclose(expect, got) def test_max_reduce(self): max_reduce = cuda.Reduce(lambda a, b: max(a, b)) - A = (np.arange(3717, dtype=np.float64) + 1) + A = np.arange(3717, dtype=np.float64) + 1 expect = A.max() got = max_reduce(A, init=0) self.assertEqual(expect, got) def test_non_identity_init(self): init = 3 - A = (np.arange(10, dtype=np.float64) + 1) + A = np.arange(10, dtype=np.float64) + 1 expect = A.sum() + init got = sum_reduce(A, init=init) 
self.assertEqual(expect, got) def test_result_on_device(self): - A = (np.arange(10, dtype=np.float64) + 1) + A = np.arange(10, dtype=np.float64) + 1 got = cuda.to_device(np.zeros(1, dtype=np.float64)) expect = A.sum() res = sum_reduce(A, res=got) @@ -72,5 +87,5 @@ def test_result_on_device(self): self.assertEqual(expect, got[0]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py b/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py index 640efcac3..52b137c74 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py @@ -21,13 +21,10 @@ def set_array_to_three(arr): def set_record_to_three(rec): - rec[0]['b'] = 3 + rec[0]["b"] = 3 -recordtype = np.dtype( - [('b', np.int32)], - align=True -) +recordtype = np.dtype([("b", np.int32)], align=True) class TestRetrieveAutoconvertedArrays(CUDATestCase): @@ -61,23 +58,23 @@ def test_array_default(self): def test_record_in(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](cuda.In(host_rec)) - self.assertEqual(0, host_rec[0]['b']) + self.assertEqual(0, host_rec[0]["b"]) def test_record_inout(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](cuda.InOut(host_rec)) - self.assertEqual(3, host_rec[0]['b']) + self.assertEqual(3, host_rec[0]["b"]) def test_record_default(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](host_rec) - self.assertEqual(3, host_rec[0]['b']) + self.assertEqual(3, host_rec[0]["b"]) def test_record_in_from_config(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three_nocopy[1, 1](host_rec) - self.assertEqual(0, host_rec[0]['b']) + self.assertEqual(0, host_rec[0]["b"]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py b/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py index b98aa85a0..08ed0d6b3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py @@ -7,9 +7,8 @@ from numba.np import numpy_support -@skip_on_cudasim('pickling not supported in CUDASIM') +@skip_on_cudasim("pickling not supported in CUDASIM") class TestPickle(CUDATestCase): - def check_call(self, callee): arr = np.array([100]) expected = callee[1, 1](arr) @@ -41,14 +40,13 @@ def test_pickling_jit_typing(self): def inner(a): return a + 1 - @cuda.jit('void(intp[:])') + @cuda.jit("void(intp[:])") def foo(arr): arr[0] = inner(arr[0]) self.check_call(foo) def test_pickling_jit(self): - @cuda.jit(device=True) def inner(a): return a + 1 @@ -60,7 +58,7 @@ def foo(arr): self.check_call(foo) def test_pickling_vectorize(self): - @vectorize(['intp(intp)', 'float64(float64)'], target='cuda') + @vectorize(["intp(intp)", "float64(float64)"], target="cuda") def cuda_vect(x): return x * 2 @@ -81,5 +79,5 @@ def cuda_vect(x): np.testing.assert_equal(expected, got2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py b/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py index f5a3df7f3..40f2c05f4 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py @@ -33,5 +33,5 @@ def test_assign_empty_slice(self): arr[:] = cuda.to_device(a) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sm.py b/numba_cuda/numba/cuda/tests/cudapy/test_sm.py index b61784a73..c037d1a39 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sm.py @@ -8,8 +8,7 @@ from .extensions_usecases import test_struct_model_type, TestStruct -recordwith2darray = np.dtype([('i', 
np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) class TestSharedMemoryIssue(CUDATestCase): @@ -42,7 +41,6 @@ def test_issue_1051_shared_size_broken_2d(self): self._check_shared_array_size((2, 3), 6) def test_issue_1051_shared_size_broken_3d(self): - self._check_shared_array_size((2, 3, 4), 24) def _check_shared_array_size_fp16(self, shape, expected, ty): @@ -71,8 +69,9 @@ def test_issue_2393(self): @cuda.jit def costs_func(d_block_costs): - s_features = cuda.shared.array((examples_per_block, num_weights), - float64) + s_features = cuda.shared.array( + (examples_per_block, num_weights), float64 + ) s_initialcost = cuda.shared.array(7, float64) # Bug threadIdx = cuda.threadIdx.x @@ -364,7 +363,7 @@ def test_issue_5073(self): def sm_slice_copy(x, y, chunksize): dynsmem = cuda.shared.array(0, dtype=dt) sm1 = dynsmem[0:chunksize] - sm2 = dynsmem[chunksize:chunksize * 2] + sm2 = dynsmem[chunksize : chunksize * 2] tx = cuda.threadIdx.x bx = cuda.blockIdx.x @@ -396,14 +395,16 @@ def test_invalid_array_type(self): rgx = ".*Cannot infer the type of variable 'arr'.*" def unsupported_type(): - arr = cuda.shared.array(10, dtype=np.dtype('O')) # noqa: F841 + arr = cuda.shared.array(10, dtype=np.dtype("O")) # noqa: F841 + with self.assertRaisesRegex(TypingError, rgx): cuda.jit(void())(unsupported_type) rgx = ".*Invalid NumPy dtype specified: 'int33'.*" def invalid_string_type(): - arr = cuda.shared.array(10, dtype='int33') # noqa: F841 + arr = cuda.shared.array(10, dtype="int33") # noqa: F841 + with self.assertRaisesRegex(TypingError, rgx): cuda.jit(void())(invalid_string_type) @@ -440,5 +441,5 @@ def write_then_reverse_read_static(outx, outy): self.assertEqual(y, (nthreads - i - 1) * 2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py b/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py index bff48e642..cfc09d5c2 
100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py @@ -17,15 +17,17 @@ def udt_global_constants(A): def udt_global_build_tuple(A): - sa = cuda.shared.array(shape=(GLOBAL_CONSTANT, GLOBAL_CONSTANT_2), - dtype=float32) + sa = cuda.shared.array( + shape=(GLOBAL_CONSTANT, GLOBAL_CONSTANT_2), dtype=float32 + ) i, j = cuda.grid(2) A[i, j] = sa[i, j] def udt_global_build_list(A): - sa = cuda.shared.array(shape=[GLOBAL_CONSTANT, GLOBAL_CONSTANT_2], - dtype=float32) + sa = cuda.shared.array( + shape=[GLOBAL_CONSTANT, GLOBAL_CONSTANT_2], dtype=float32 + ) i, j = cuda.grid(2) A[i, j] = sa[i, j] @@ -59,7 +61,7 @@ def getarg(self): return np.array(100, dtype=np.float32, ndmin=1) def getarg2(self): - return self.getarg().reshape(1,1) + return self.getarg().reshape(1, 1) def test_global_constants(self): udt = cuda.jit((float32[:],))(udt_global_constants) @@ -69,18 +71,21 @@ def test_global_build_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_build_tuple) udt[1, 1](self.getarg2()) - @skip_on_cudasim('Simulator does not prohibit lists for shared array shape') + @skip_on_cudasim("Simulator does not prohibit lists for shared array shape") def test_global_build_list(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:, :],))(udt_global_build_list) - self.assertIn("No implementation of function " - "Function(>> array(shape=list(int64), " - "dtype=class(float32)", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=list(int64), " + "dtype=class(float32)", + str(raises.exception), + ) def test_global_constant_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_constant_tuple) @@ -92,12 +97,15 @@ def test_invalid_1(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:],))(udt_invalid_1) - self.assertIn("No implementation of function " - "Function(>> array(shape=float32, dtype=class(float32))", - 
str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=float32, dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_2(self): @@ -105,13 +113,16 @@ def test_invalid_2(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:, :],))(udt_invalid_2) - self.assertIn("No implementation of function " - "Function(>> array(shape=Tuple(Literal[int](1), " - "array(float32, 1d, A)), dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=Tuple(Literal[int](1), " + "array(float32, 1d, A)), dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_3(self): @@ -119,12 +130,15 @@ def test_invalid_3(self): with self.assertRaises(TypingError) as raises: cuda.jit((int32[:],))(udt_invalid_1) - self.assertIn("No implementation of function " - "Function(>> array(shape=int32, dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=int32, dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_4(self): @@ -132,18 +146,21 @@ def test_invalid_4(self): with self.assertRaises(TypingError) as raises: cuda.jit((int32[:],))(udt_invalid_3) - self.assertIn("No implementation of function " - "Function(>> array(shape=Tuple(Literal[int](1), int32), " - "dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=Tuple(Literal[int](1), int32), " + "dtype=class(float32))", + str(raises.exception), + ) def check_dtype(self, f, dtype): # Find the typing of the dtype argument to cuda.shared.array annotation = next(iter(f.overloads.values()))._type_annotation - l_dtype = annotation.typemap['s'].dtype + l_dtype = 
annotation.typemap["s"].dtype # Ensure that the typing is correct self.assertEqual(l_dtype, dtype) @@ -174,7 +191,7 @@ def test_string_dtype(self): # Check that strings can be used to specify the dtype of a shared array @cuda.jit(void(int32[::1])) def f(x): - s = cuda.shared.array(10, dtype='int32') + s = cuda.shared.array(10, dtype="int32") s[0] = x[0] x[0] = s[0] @@ -185,9 +202,10 @@ def test_invalid_string_dtype(self): # Check that strings of invalid dtypes cause a typing error re = ".*Invalid NumPy dtype specified: 'int33'.*" with self.assertRaisesRegex(TypingError, re): + @cuda.jit(void(int32[::1])) def f(x): - s = cuda.shared.array(10, dtype='int33') + s = cuda.shared.array(10, dtype="int33") s[0] = x[0] x[0] = s[0] @@ -198,8 +216,9 @@ def f(x): s = cuda.shared.array(10, dtype=test_struct_model_type) s[0] = x[0] x[0] = s[0] + self.check_dtype(f, test_struct_model_type) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py b/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py index c27055b02..8367b460e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py @@ -1,5 +1,9 @@ -from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_unless_cudasim, + unittest, + CUDATestCase, +) from numba import config, cuda # Basic tests that stream APIs execute on the hardware and in the simulator. 
@@ -48,5 +52,5 @@ def test_external_stream_simulator_unavailable(self): cuda.external_stream(ptr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sync.py b/numba_cuda/numba/cuda/tests/cudapy/test_sync.py index d4d9326f0..4eaff55c9 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sync.py @@ -133,14 +133,16 @@ def test_useless_syncwarp(self): self._test_useless(useless_syncwarp) @skip_on_cudasim("syncwarp not implemented on cudasim") - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Partial masks require CC 7.0 or greater") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Partial masks require CC 7.0 or greater" + ) def test_useless_syncwarp_with_mask(self): self._test_useless(useless_syncwarp_with_mask) @skip_on_cudasim("syncwarp not implemented on cudasim") - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Partial masks require CC 7.0 or greater") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Partial masks require CC 7.0 or greater" + ) def test_coop_syncwarp(self): # coop_syncwarp computes the sum of all integers from 0 to 31 (496) # using a single warp @@ -267,5 +269,5 @@ def test_syncthreads_or_downcast(self): self._test_syncthreads_or(np.int64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py b/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py index 9c13db534..38243b78d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py @@ -5,17 +5,17 @@ from numba.cuda.testing import skip_on_cudasim, CUDATestCase -recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) -@skip_on_cudasim('Device Array API unsupported in the simulator') +@skip_on_cudasim("Device Array API 
unsupported in the simulator") class TestTranspose(CUDATestCase): - def test_transpose(self): - variants = ((5, 6, np.float64), - (128, 128, np.complex128), - (1025, 512, np.float64)) + variants = ( + (5, 6, np.float64), + (128, 128, np.complex128), + (1025, 512, np.float64), + ) for rows, cols, dtype in variants: with self.subTest(rows=rows, cols=cols, dtype=dtype): @@ -27,8 +27,15 @@ def test_transpose(self): dy.copy_to_host(y) np.testing.assert_array_equal(x.transpose(), y) - small_variants = ((2, 3), (16, 16), (16, 17), (17, 16), (14, 15), (15, 14), - (14, 14)) + small_variants = ( + (2, 3), + (16, 16), + (16, 17), + (17, 16), + (14, 15), + (15, 14), + (14, 14), + ) def test_transpose_record(self): for rows, cols in self.small_variants: @@ -36,7 +43,7 @@ def test_transpose_record(self): arr = np.recarray((rows, cols), dtype=recordwith2darray) for x in range(rows): for y in range(cols): - arr[x, y].i = x ** 2 + y + arr[x, y].i = x**2 + y j = np.arange(3 * 2, dtype=np.float32) arr[x, y].j = j.reshape(3, 2) * x + y @@ -76,5 +83,5 @@ def test_transpose_view(self): np.testing.assert_array_equal(a_view_t, h_a_view_t) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py index 7a98abde7..63340ecce 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py @@ -9,11 +9,11 @@ def _make_ufunc_usecase(ufunc): ldict = {} - arg_str = ','.join(['a{0}'.format(i) for i in range(ufunc.nargs)]) - func_str = f'def fn({arg_str}):\n np.{ufunc.__name__}({arg_str})' + arg_str = ",".join(["a{0}".format(i) for i in range(ufunc.nargs)]) + func_str = f"def fn({arg_str}):\n np.{ufunc.__name__}({arg_str})" exec(func_str, globals(), ldict) - fn = ldict['fn'] - fn.__name__ = '{0}_usecase'.format(ufunc.__name__) + fn = ldict["fn"] + fn.__name__ = "{0}_usecase".format(ufunc.__name__) return fn @@ -32,58 
+32,75 @@ def setUp(self): # The basic ufunc test does not set up complex inputs, so we'll add # some here for testing with CUDA. - self.inputs.extend([ - (np.complex64(-0.5 - 0.5j), types.complex64), - (np.complex64(0.0), types.complex64), - (np.complex64(0.5 + 0.5j), types.complex64), - - (np.complex128(-0.5 - 0.5j), types.complex128), - (np.complex128(0.0), types.complex128), - (np.complex128(0.5 + 0.5j), types.complex128), - - (np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype='c8'), - types.Array(types.complex64, 1, 'C')), - (np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype='c16'), - types.Array(types.complex128, 1, 'C')), - ]) + self.inputs.extend( + [ + (np.complex64(-0.5 - 0.5j), types.complex64), + (np.complex64(0.0), types.complex64), + (np.complex64(0.5 + 0.5j), types.complex64), + (np.complex128(-0.5 - 0.5j), types.complex128), + (np.complex128(0.0), types.complex128), + (np.complex128(0.5 + 0.5j), types.complex128), + ( + np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c8"), + types.Array(types.complex64, 1, "C"), + ), + ( + np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c16"), + types.Array(types.complex128, 1, "C"), + ), + ] + ) # Test with multiple dimensions - self.inputs.extend([ - # Basic 2D and 3D arrays - (np.linspace(0, 1).reshape((5, -1)), - types.Array(types.float64, 2, 'C')), - (np.linspace(0, 1).reshape((2, 5, -1)), - types.Array(types.float64, 3, 'C')), - # Complex data (i.e. interleaved) - (np.linspace(0, 1 + 1j).reshape(5, -1), - types.Array(types.complex128, 2, 'C')), - # F-ordered - (np.asfortranarray(np.linspace(0, 1).reshape((5, -1))), - types.Array(types.float64, 2, 'F')), - ]) + self.inputs.extend( + [ + # Basic 2D and 3D arrays + ( + np.linspace(0, 1).reshape((5, -1)), + types.Array(types.float64, 2, "C"), + ), + ( + np.linspace(0, 1).reshape((2, 5, -1)), + types.Array(types.float64, 3, "C"), + ), + # Complex data (i.e. 
interleaved) + ( + np.linspace(0, 1 + 1j).reshape(5, -1), + types.Array(types.complex128, 2, "C"), + ), + # F-ordered + ( + np.asfortranarray(np.linspace(0, 1).reshape((5, -1))), + types.Array(types.float64, 2, "F"), + ), + ] + ) # Add tests for other integer types - self.inputs.extend([ - (np.uint8(0), types.uint8), - (np.uint8(1), types.uint8), - (np.int8(-1), types.int8), - (np.int8(0), types.int8), - - (np.uint16(0), types.uint16), - (np.uint16(1), types.uint16), - (np.int16(-1), types.int16), - (np.int16(0), types.int16), - - (np.ulonglong(0), types.ulonglong), - (np.ulonglong(1), types.ulonglong), - (np.longlong(-1), types.longlong), - (np.longlong(0), types.longlong), - - (np.array([0,1], dtype=np.ulonglong), - types.Array(types.ulonglong, 1, 'C')), - (np.array([0,1], dtype=np.longlong), - types.Array(types.longlong, 1, 'C')), - ]) + self.inputs.extend( + [ + (np.uint8(0), types.uint8), + (np.uint8(1), types.uint8), + (np.int8(-1), types.int8), + (np.int8(0), types.int8), + (np.uint16(0), types.uint16), + (np.uint16(1), types.uint16), + (np.int16(-1), types.int16), + (np.int16(0), types.int16), + (np.ulonglong(0), types.ulonglong), + (np.ulonglong(1), types.ulonglong), + (np.longlong(-1), types.longlong), + (np.longlong(0), types.longlong), + ( + np.array([0, 1], dtype=np.ulonglong), + types.Array(types.ulonglong, 1, "C"), + ), + ( + np.array([0, 1], dtype=np.longlong), + types.Array(types.longlong, 1, "C"), + ), + ] + ) self._low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS self._warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY @@ -111,18 +128,18 @@ def basic_int_ufunc_test(self, name=None): skip_inputs = [ types.float32, types.float64, - types.Array(types.float32, 1, 'C'), - types.Array(types.float32, 2, 'C'), - types.Array(types.float64, 1, 'C'), - types.Array(types.float64, 2, 'C'), - types.Array(types.float64, 3, 'C'), - types.Array(types.float64, 2, 'F'), + types.Array(types.float32, 1, "C"), + types.Array(types.float32, 2, "C"), + 
types.Array(types.float64, 1, "C"), + types.Array(types.float64, 2, "C"), + types.Array(types.float64, 3, "C"), + types.Array(types.float64, 2, "F"), types.complex64, types.complex128, - types.Array(types.complex64, 1, 'C'), - types.Array(types.complex64, 2, 'C'), - types.Array(types.complex128, 1, 'C'), - types.Array(types.complex128, 2, 'C'), + types.Array(types.complex64, 1, "C"), + types.Array(types.complex64, 2, "C"), + types.Array(types.complex128, 1, "C"), + types.Array(types.complex128, 2, "C"), ] self.basic_ufunc_test(name, skip_inputs=skip_inputs) @@ -130,43 +147,43 @@ def basic_int_ufunc_test(self, name=None): # Trigonometric Functions def test_sin_ufunc(self): - self.basic_ufunc_test(np.sin, kinds='cf') + self.basic_ufunc_test(np.sin, kinds="cf") def test_cos_ufunc(self): - self.basic_ufunc_test(np.cos, kinds='cf') + self.basic_ufunc_test(np.cos, kinds="cf") def test_tan_ufunc(self): - self.basic_ufunc_test(np.tan, kinds='cf') + self.basic_ufunc_test(np.tan, kinds="cf") def test_arcsin_ufunc(self): - self.basic_ufunc_test(np.arcsin, kinds='cf') + self.basic_ufunc_test(np.arcsin, kinds="cf") def test_arccos_ufunc(self): - self.basic_ufunc_test(np.arccos, kinds='cf') + self.basic_ufunc_test(np.arccos, kinds="cf") def test_arctan_ufunc(self): - self.basic_ufunc_test(np.arctan, kinds='cf') + self.basic_ufunc_test(np.arctan, kinds="cf") def test_arctan2_ufunc(self): - self.basic_ufunc_test(np.arctan2, kinds='f') + self.basic_ufunc_test(np.arctan2, kinds="f") def test_hypot_ufunc(self): - self.basic_ufunc_test(np.hypot, kinds='f') + self.basic_ufunc_test(np.hypot, kinds="f") def test_sinh_ufunc(self): - self.basic_ufunc_test(np.sinh, kinds='cf') + self.basic_ufunc_test(np.sinh, kinds="cf") def test_cosh_ufunc(self): - self.basic_ufunc_test(np.cosh, kinds='cf') + self.basic_ufunc_test(np.cosh, kinds="cf") def test_tanh_ufunc(self): - self.basic_ufunc_test(np.tanh, kinds='cf') + self.basic_ufunc_test(np.tanh, kinds="cf") def test_arcsinh_ufunc(self): - 
self.basic_ufunc_test(np.arcsinh, kinds='cf') + self.basic_ufunc_test(np.arcsinh, kinds="cf") def test_arccosh_ufunc(self): - self.basic_ufunc_test(np.arccosh, kinds='cf') + self.basic_ufunc_test(np.arccosh, kinds="cf") def test_arctanh_ufunc(self): # arctanh is only valid is only finite in the range ]-1, 1[ @@ -177,24 +194,30 @@ def test_arctanh_ufunc(self): # used to compile NumPy may differ from the result generated by # llvm. Skipping the integer types in this test avoids failed # tests because of this. - to_skip = [types.Array(types.uint32, 1, 'C'), types.uint32, - types.Array(types.int32, 1, 'C'), types.int32, - types.Array(types.uint64, 1, 'C'), types.uint64, - types.Array(types.int64, 1, 'C'), types.int64] + to_skip = [ + types.Array(types.uint32, 1, "C"), + types.uint32, + types.Array(types.int32, 1, "C"), + types.int32, + types.Array(types.uint64, 1, "C"), + types.uint64, + types.Array(types.int64, 1, "C"), + types.int64, + ] - self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds='cf') + self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds="cf") def test_deg2rad_ufunc(self): - self.basic_ufunc_test(np.deg2rad, kinds='f') + self.basic_ufunc_test(np.deg2rad, kinds="f") def test_rad2deg_ufunc(self): - self.basic_ufunc_test(np.rad2deg, kinds='f') + self.basic_ufunc_test(np.rad2deg, kinds="f") def test_degrees_ufunc(self): - self.basic_ufunc_test(np.degrees, kinds='f') + self.basic_ufunc_test(np.degrees, kinds="f") def test_radians_ufunc(self): - self.basic_ufunc_test(np.radians, kinds='f') + self.basic_ufunc_test(np.radians, kinds="f") ############################################################################ # Comparison functions @@ -264,14 +287,14 @@ def test_bitwise_not_ufunc(self): # Mathematical Functions def test_log_ufunc(self): - self.basic_ufunc_test(np.log, kinds='cf') + self.basic_ufunc_test(np.log, kinds="cf") def test_log2_ufunc(self): - self.basic_ufunc_test(np.log2, kinds='cf') + self.basic_ufunc_test(np.log2, kinds="cf") def 
test_log10_ufunc(self): - self.basic_ufunc_test(np.log10, kinds='cf') + self.basic_ufunc_test(np.log10, kinds="cf") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py b/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py index 6073c3f3f..b444c9155 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py @@ -13,7 +13,6 @@ class MyError(Exception): class TestUserExc(CUDATestCase): - def setUp(self): super().setUp() # LTO optimizes away the exception status due to an oversight @@ -29,7 +28,7 @@ def test_exc(x): elif x == 2: raise MyError("foo") - test_exc[1, 1](0) # no raise + test_exc[1, 1](0) # no raise with self.assertRaises(MyError) as cm: test_exc[1, 1](1) if not config.ENABLE_CUDASIM: @@ -43,5 +42,5 @@ def test_exc(x): self.assertIn("tid=[0, 0, 0] ctaid=[0, 0, 0]: foo", str(cm.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py b/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py index 1ee72f2d3..9fef225df 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py @@ -44,12 +44,7 @@ def kernel_3elem(res): res[2] = v.z def kernel_4elem(res): - v = vobj( - base_type(0), - base_type(1), - base_type(2), - base_type(3) - ) + v = vobj(base_type(0), base_type(1), base_type(2), base_type(3)) res[0] = v.x res[1] = v.y res[2] = v.z @@ -59,7 +54,7 @@ def kernel_4elem(res): 1: kernel_1elem, 2: kernel_2elem, 3: kernel_3elem, - 4: kernel_4elem + 4: kernel_4elem, }[vtype.num_elements] return cuda.jit(host_function) @@ -83,13 +78,13 @@ def kernel(res): three = base_type(3.0) four = base_type(4.0) - j = 0 # index of the result array + j = 0 # index of the result array # Construct a 1-component vector type, possible combination includes: # 2C1 = 2 combinations. 
f1_1 = v1(one) # 1 - f1_2 = v1(f1_1) # 1 + f1_2 = v1(f1_1) # 1 res[0] = f1_1.x res[1] = f1_2.x @@ -98,11 +93,11 @@ def kernel(res): # Construct a 2-component vector type, possible combination includes: # 1 + 2C1 * 2 = 5 combinations - f2_1 = v2(two, three) # 2 3 - f2_2 = v2(f1_1, three) # 1 3 - f2_3 = v2(two, f1_1) # 2 1 - f2_4 = v2(f1_1, f1_1) # 1 1 - f2_5 = v2(f2_1) # 2 3 + f2_1 = v2(two, three) # 2 3 + f2_2 = v2(f1_1, three) # 1 3 + f2_3 = v2(two, f1_1) # 2 1 + f2_4 = v2(f1_1, f1_1) # 1 1 + f2_5 = v2(f2_1) # 2 3 for v in (f2_1, f2_2, f2_3, f2_4, f2_5): res[j] = v.x @@ -112,24 +107,37 @@ def kernel(res): # Construct a 3-component vector type, possible combination includes: # 1 + 2C1 * 2 + 2^3 = 13 combinations - f3_1 = v3(f2_1, one) # 2 3 1 - f3_2 = v3(f2_1, f1_1) # 2 3 1 - f3_3 = v3(one, f2_1) # 1 2 3 - f3_4 = v3(f1_1, f2_1) # 1 2 3 - - f3_5 = v3(one, two, three) # 1 2 3 - f3_6 = v3(f1_1, two, three) # 1 2 3 - f3_7 = v3(one, f1_1, three) # 1 1 3 - f3_8 = v3(one, two, f1_1) # 1 2 1 - f3_9 = v3(f1_1, f1_1, three) # 1 1 3 - f3_10 = v3(one, f1_1, f1_1) # 1 1 1 - f3_11 = v3(f1_1, two, f1_1) # 1 2 1 - f3_12 = v3(f1_1, f1_1, f1_1) # 1 1 1 - - f3_13 = v3(f3_1) # 2 3 1 - - for v in (f3_1, f3_2, f3_3, f3_4, f3_5, f3_6, f3_7, f3_8, f3_9, - f3_10, f3_11, f3_12, f3_13): + f3_1 = v3(f2_1, one) # 2 3 1 + f3_2 = v3(f2_1, f1_1) # 2 3 1 + f3_3 = v3(one, f2_1) # 1 2 3 + f3_4 = v3(f1_1, f2_1) # 1 2 3 + + f3_5 = v3(one, two, three) # 1 2 3 + f3_6 = v3(f1_1, two, three) # 1 2 3 + f3_7 = v3(one, f1_1, three) # 1 1 3 + f3_8 = v3(one, two, f1_1) # 1 2 1 + f3_9 = v3(f1_1, f1_1, three) # 1 1 3 + f3_10 = v3(one, f1_1, f1_1) # 1 1 1 + f3_11 = v3(f1_1, two, f1_1) # 1 2 1 + f3_12 = v3(f1_1, f1_1, f1_1) # 1 1 1 + + f3_13 = v3(f3_1) # 2 3 1 + + for v in ( + f3_1, + f3_2, + f3_3, + f3_4, + f3_5, + f3_6, + f3_7, + f3_8, + f3_9, + f3_10, + f3_11, + f3_12, + f3_13, + ): res[j] = v.x res[j + 1] = v.y res[j + 2] = v.z @@ -138,48 +146,80 @@ def kernel(res): # Construct a 4-component vector type, 
possible combination includes: # 1 + (2C1 * 2 + 1) + 3C1 * 2^2 + 2^4 = 34 combinations - f4_1 = v4(one, two, three, four) # 1 2 3 4 - f4_2 = v4(f1_1, two, three, four) # 1 2 3 4 - f4_3 = v4(one, f1_1, three, four) # 1 1 3 4 - f4_4 = v4(one, two, f1_1, four) # 1 2 1 4 - f4_5 = v4(one, two, three, f1_1) # 1 2 3 1 + f4_1 = v4(one, two, three, four) # 1 2 3 4 + f4_2 = v4(f1_1, two, three, four) # 1 2 3 4 + f4_3 = v4(one, f1_1, three, four) # 1 1 3 4 + f4_4 = v4(one, two, f1_1, four) # 1 2 1 4 + f4_5 = v4(one, two, three, f1_1) # 1 2 3 1 f4_6 = v4(f1_1, f1_1, three, four) # 1 1 3 4 - f4_7 = v4(f1_1, two, f1_1, four) # 1 2 1 4 - f4_8 = v4(f1_1, two, three, f1_1) # 1 2 3 1 - f4_9 = v4(one, f1_1, f1_1, four) # 1 1 1 4 + f4_7 = v4(f1_1, two, f1_1, four) # 1 2 1 4 + f4_8 = v4(f1_1, two, three, f1_1) # 1 2 3 1 + f4_9 = v4(one, f1_1, f1_1, four) # 1 1 1 4 f4_10 = v4(one, f1_1, three, f1_1) # 1 1 3 1 - f4_11 = v4(one, two, f1_1, f1_1) # 1 2 1 1 + f4_11 = v4(one, two, f1_1, f1_1) # 1 2 1 1 f4_12 = v4(f1_1, f1_1, f1_1, four) # 1 1 1 4 - f4_13 = v4(f1_1, f1_1, three, f1_1) # 1 1 3 1 - f4_14 = v4(f1_1, two, f1_1, f1_1) # 1 2 1 1 - f4_15 = v4(one, f1_1, f1_1, f1_1) # 1 1 1 1 + f4_13 = v4(f1_1, f1_1, three, f1_1) # 1 1 3 1 + f4_14 = v4(f1_1, two, f1_1, f1_1) # 1 2 1 1 + f4_15 = v4(one, f1_1, f1_1, f1_1) # 1 1 1 1 f4_16 = v4(f1_1, f1_1, f1_1, f1_1) # 1 1 1 1 - f4_17 = v4(f2_1, two, three) # 2 3 2 3 - f4_18 = v4(f2_1, f1_1, three) # 2 3 1 3 - f4_19 = v4(f2_1, two, f1_1) # 2 3 2 1 - f4_20 = v4(f2_1, f1_1, f1_1) # 2 3 1 1 - f4_21 = v4(one, f2_1, three) # 1 2 3 3 - f4_22 = v4(f1_1, f2_1, three) # 1 2 3 3 - f4_23 = v4(one, f2_1, f1_1) # 1 2 3 1 - f4_24 = v4(f1_1, f2_1, f1_1) # 1 2 3 1 - f4_25 = v4(one, four, f2_1) # 1 4 2 3 - f4_26 = v4(f1_1, four, f2_1) # 1 4 2 3 - f4_27 = v4(one, f1_1, f2_1) # 1 1 2 3 - f4_28 = v4(f1_1, f1_1, f2_1) # 1 1 2 3 - - f4_29 = v4(f2_1, f2_1) # 2 3 2 3 - f4_30 = v4(f3_1, four) # 2 3 1 4 - f4_31 = v4(f3_1, f1_1) # 2 3 1 1 - f4_32 = v4(four, f3_1) # 4 2 3 1 - 
f4_33 = v4(f1_1, f3_1) # 1 2 3 1 - - f4_34 = v4(f4_1) # 1 2 3 4 - - for v in (f4_1, f4_2, f4_3, f4_4, f4_5, f4_6, f4_7, f4_8, f4_9, f4_10, - f4_11, f4_12, f4_13, f4_14, f4_15, f4_16, f4_17, f4_18, f4_19, - f4_20, f4_21, f4_22, f4_23, f4_24, f4_25, f4_26, f4_27, f4_28, - f4_29, f4_30, f4_31, f4_32, f4_33, f4_34): + f4_17 = v4(f2_1, two, three) # 2 3 2 3 + f4_18 = v4(f2_1, f1_1, three) # 2 3 1 3 + f4_19 = v4(f2_1, two, f1_1) # 2 3 2 1 + f4_20 = v4(f2_1, f1_1, f1_1) # 2 3 1 1 + f4_21 = v4(one, f2_1, three) # 1 2 3 3 + f4_22 = v4(f1_1, f2_1, three) # 1 2 3 3 + f4_23 = v4(one, f2_1, f1_1) # 1 2 3 1 + f4_24 = v4(f1_1, f2_1, f1_1) # 1 2 3 1 + f4_25 = v4(one, four, f2_1) # 1 4 2 3 + f4_26 = v4(f1_1, four, f2_1) # 1 4 2 3 + f4_27 = v4(one, f1_1, f2_1) # 1 1 2 3 + f4_28 = v4(f1_1, f1_1, f2_1) # 1 1 2 3 + + f4_29 = v4(f2_1, f2_1) # 2 3 2 3 + f4_30 = v4(f3_1, four) # 2 3 1 4 + f4_31 = v4(f3_1, f1_1) # 2 3 1 1 + f4_32 = v4(four, f3_1) # 4 2 3 1 + f4_33 = v4(f1_1, f3_1) # 1 2 3 1 + + f4_34 = v4(f4_1) # 1 2 3 4 + + for v in ( + f4_1, + f4_2, + f4_3, + f4_4, + f4_5, + f4_6, + f4_7, + f4_8, + f4_9, + f4_10, + f4_11, + f4_12, + f4_13, + f4_14, + f4_15, + f4_16, + f4_17, + f4_18, + f4_19, + f4_20, + f4_21, + f4_22, + f4_23, + f4_24, + f4_25, + f4_26, + f4_27, + f4_28, + f4_29, + f4_30, + f4_31, + f4_32, + f4_33, + f4_34, + ): res[j] = v.x res[j + 1] = v.y res[j + 2] = v.z @@ -190,13 +230,13 @@ def kernel(res): class TestCudaVectorType(CUDATestCase): - def test_basic(self): """Basic test that makes sure that vector type and aliases are available within the cuda module from both device and simulator mode. This is an important sanity check, since other tests below tests the vector type objects programmatically. 
""" + @cuda.jit("void(float64[:])") def kernel(arr): v1 = cuda.float64x4(1.0, 3.0, 5.0, 7.0) @@ -227,66 +267,201 @@ def test_fancy_creation_readout(self): with self.subTest(vty=vty): kernel = make_fancy_creation_kernel(vty) - expected = np.array([ - # 1-component vectors - 1, - 1, - # 2-component vectors - 2, 3, - 1, 3, - 2, 1, - 1, 1, - 2, 3, - # 3-component vectors - 2, 3, 1, - 2, 3, 1, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 1, 3, - 1, 2, 1, - 1, 1, 3, - 1, 1, 1, - 1, 2, 1, - 1, 1, 1, - 2, 3, 1, - # 4-component vectors - 1, 2, 3, 4, - 1, 2, 3, 4, - 1, 1, 3, 4, - 1, 2, 1, 4, - 1, 2, 3, 1, - 1, 1, 3, 4, - 1, 2, 1, 4, - 1, 2, 3, 1, - 1, 1, 1, 4, - 1, 1, 3, 1, - 1, 2, 1, 1, - 1, 1, 1, 4, - 1, 1, 3, 1, - 1, 2, 1, 1, - 1, 1, 1, 1, - 1, 1, 1, 1, - 2, 3, 2, 3, - 2, 3, 1, 3, - 2, 3, 2, 1, - 2, 3, 1, 1, - 1, 2, 3, 3, - 1, 2, 3, 3, - 1, 2, 3, 1, - 1, 2, 3, 1, - 1, 4, 2, 3, - 1, 4, 2, 3, - 1, 1, 2, 3, - 1, 1, 2, 3, - 2, 3, 2, 3, - 2, 3, 1, 4, - 2, 3, 1, 1, - 4, 2, 3, 1, - 1, 2, 3, 1, - 1, 2, 3, 4 - ]) + expected = np.array( + [ + # 1-component vectors + 1, + 1, + # 2-component vectors + 2, + 3, + 1, + 3, + 2, + 1, + 1, + 1, + 2, + 3, + # 3-component vectors + 2, + 3, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 1, + 3, + 1, + 2, + 1, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 2, + 3, + 1, + # 4-component vectors + 1, + 2, + 3, + 4, + 1, + 2, + 3, + 4, + 1, + 1, + 3, + 4, + 1, + 2, + 1, + 4, + 1, + 2, + 3, + 1, + 1, + 1, + 3, + 4, + 1, + 2, + 1, + 4, + 1, + 2, + 3, + 1, + 1, + 1, + 1, + 4, + 1, + 1, + 3, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 4, + 1, + 1, + 3, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 3, + 1, + 3, + 2, + 3, + 2, + 1, + 2, + 3, + 1, + 1, + 1, + 2, + 3, + 3, + 1, + 2, + 3, + 3, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 4, + 2, + 3, + 1, + 4, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 1, + 4, + 2, 
+ 3, + 1, + 1, + 4, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 4, + ] + ) arr = np.zeros(expected.shape) kernel[1, 1](arr) np.testing.assert_almost_equal(arr, expected) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py index c88e1792b..f4c540ca1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py @@ -13,9 +13,11 @@ # Signatures to test with - these are all homogeneous in dtype, so the output # dtype should match the input dtype - the output should not have been cast # upwards, as reported in #8400: https://github.com/numba/numba/issues/8400 -signatures = [int32(int32, int32), - float32(float32, float32), - float64(float64, float64)] +signatures = [ + int32(int32, int32), + float32(float32, float32), + float64(float64, float64), +] # The order here is chosen such that each subsequent dtype might have been # casted to a previously-used dtype. This is unlikely to be an issue for CUDA, @@ -25,16 +27,16 @@ dtypes = (np.float64, np.float32, np.int32) # NumPy ndarray orders -orders = ('C', 'F') +orders = ("C", "F") # Input sizes corresponding to operations: # - Less than one warp, # - Less than one block, # - Greater than one block (i.e. 
many blocks) -input_sizes = (8, 100, 2 ** 10 + 1) +input_sizes = (8, 100, 2**10 + 1) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorize(CUDATestCase): # Presumably chosen as an odd number unlikely to coincide with the total # thread count, and large enough to ensure a significant number of blocks @@ -42,8 +44,7 @@ class TestCUDAVectorize(CUDATestCase): N = 1000001 def test_scalar(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -53,8 +54,7 @@ def vector_add(a, b): self.assertEqual(c, a + b) def test_1d(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -66,8 +66,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, ty) def test_1d_async(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -86,8 +85,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, ty) def test_nd(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -102,7 +100,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, dtype) def test_output_arg(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -117,7 +115,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def test_reduce(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -135,8 +133,7 @@ def vector_add(a, b): self.assertEqual(dtype, actual.dtype) def test_reduce_async(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -153,7 +150,7 @@ def vector_add(a, b): self.assertEqual(dtype, actual.dtype) def 
test_manual_transfer(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -166,7 +163,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def test_ufunc_output_2d(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -181,7 +178,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def check_tuple_arg(self, a, b): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -194,7 +191,7 @@ def test_tuple_arg(self): self.check_tuple_arg(a, b) def test_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) + Point = namedtuple("Point", ("x", "y", "z")) a = Point(x=1.0, y=2.0, z=3.0) b = Point(x=4.0, y=5.0, z=6.0) self.check_tuple_arg(a, b) @@ -206,7 +203,7 @@ def test_tuple_of_array_arg(self): self.check_tuple_arg(a, b) def test_tuple_of_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) + Point = namedtuple("Point", ("x", "y", "z")) a = (Point(x=1.0, y=2.0, z=3.0), Point(x=1.5, y=2.5, z=3.5)) b = (Point(x=4.0, y=5.0, z=6.0), Point(x=4.5, y=5.5, z=6.5)) self.check_tuple_arg(a, b) @@ -216,17 +213,17 @@ def test_namedtuple_of_array_arg(self): ys1 = xs1 + 2 xs2 = np.arange(10, dtype=np.int32) * 2 ys2 = xs2 + 1 - Points = namedtuple('Points', ('xs', 'ys')) + Points = namedtuple("Points", ("xs", "ys")) a = Points(xs=xs1, ys=ys1) b = Points(xs=xs2, ys=ys2) self.check_tuple_arg(a, b) def test_name_attribute(self): - @vectorize('f8(f8)', target='cuda') + @vectorize("f8(f8)", target="cuda") def bar(x): - return x ** 2 + return x**2 - self.assertEqual(bar.__name__, 'bar') + self.assertEqual(bar.__name__, "bar") def test_no_transfer_for_device_data(self): # Initialize test data on the device prior to banning host <-> device @@ -238,15 +235,15 @@ def test_no_transfer_for_device_data(self): # A mock of a CUDA 
function that always raises a CudaAPIError def raising_transfer(*args, **kwargs): - raise CudaAPIError(999, 'Transfer not allowed') + raise CudaAPIError(999, "Transfer not allowed") # Use the mock for transfers between the host and device - old_HtoD = getattr(driver, 'cuMemcpyHtoD', None) - old_DtoH = getattr(driver, 'cuMemcpyDtoH', None) + old_HtoD = getattr(driver, "cuMemcpyHtoD", None) + old_DtoH = getattr(driver, "cuMemcpyDtoH", None) - setattr(driver, 'cuMemcpyHtoD', raising_transfer) - setattr(driver, 'cuMemcpyDtoH', raising_transfer) + setattr(driver, "cuMemcpyHtoD", raising_transfer) + setattr(driver, "cuMemcpyDtoH", raising_transfer) # Ensure that the mock functions are working as expected @@ -260,7 +257,7 @@ def raising_transfer(*args, **kwargs): # Check that defining and calling a ufunc with data on the device # induces no transfers - @vectorize(['float32(float32)'], target='cuda') + @vectorize(["float32(float32)"], target="cuda") def func(noise): return noise + 1.0 @@ -270,14 +267,14 @@ def func(noise): # no original implementation, simply remove ours. 
if old_HtoD is not None: - setattr(driver, 'cuMemcpyHtoD', old_HtoD) + setattr(driver, "cuMemcpyHtoD", old_HtoD) else: del driver.cuMemcpyHtoD if old_DtoH is not None: - setattr(driver, 'cuMemcpyDtoH', old_DtoH) + setattr(driver, "cuMemcpyDtoH", old_DtoH) else: del driver.cuMemcpyDtoH -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py index 82c7ca8f8..8da551309 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py @@ -4,17 +4,17 @@ import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeComplex(CUDATestCase): def test_vectorize_complex(self): - @vectorize(['complex128(complex128)'], target='cuda') + @vectorize(["complex128(complex128)"], target="cuda") def vcomp(a): - return a * a + 1. 
+ return a * a + 1.0 A = np.arange(5, dtype=np.complex128) B = vcomp(A) - self.assertTrue(np.allclose(A * A + 1., B)) + self.assertTrue(np.allclose(A * A + 1.0, B)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py index 12b8fa03c..1c2bd513d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py @@ -1,21 +1,25 @@ import numpy as np from numba import vectorize, cuda -from numba.tests.npyufunc.test_vectorize_decor import BaseVectorizeDecor, \ - BaseVectorizeNopythonArg, BaseVectorizeUnrecognizedArg +from numba.tests.npyufunc.test_vectorize_decor import ( + BaseVectorizeDecor, + BaseVectorizeNopythonArg, + BaseVectorizeUnrecognizedArg, +) from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeDecor(CUDATestCase, BaseVectorizeDecor): """ Runs the tests from BaseVectorizeDecor with the CUDA target. 
""" - target = 'cuda' + target = "cuda" -@skip_on_cudasim('ufunc API unsupported in the simulator') + +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestGPUVectorizeBroadcast(CUDATestCase): def test_broadcast(self): a = np.random.randn(100, 3, 1) @@ -24,7 +28,7 @@ def test_broadcast(self): def fn(a, b): return a - b - @vectorize(['float64(float64,float64)'], target='cuda') + @vectorize(["float64(float64,float64)"], target="cuda") def fngpu(a, b): return a - b @@ -43,7 +47,7 @@ def test_device_broadcast(self): def fn(a, b): return a - b - @vectorize(['float64(float64,float64)'], target='cuda') + @vectorize(["float64(float64,float64)"], target="cuda") def fngpu(a, b): return a - b @@ -52,18 +56,18 @@ def fngpu(a, b): np.testing.assert_almost_equal(expect, got.copy_to_host()) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeNopythonArg(BaseVectorizeNopythonArg, CUDATestCase): def test_target_cuda_nopython(self): warnings = ["nopython kwarg for cuda target is redundant"] - self._test_target_nopython('cuda', warnings) + self._test_target_nopython("cuda", warnings) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeUnrecognizedArg(BaseVectorizeUnrecognizedArg, CUDATestCase): def test_target_cuda_unrecognized_arg(self): - self._test_target_unrecognized_arg('cuda') + self._test_target_unrecognized_arg("cuda") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py index e33598d8b..67e2d3265 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py @@ -5,19 +5,19 @@ import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') 
+@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCudaVectorizeDeviceCall(CUDATestCase): def test_cuda_vectorize_device_call(self): - @cuda.jit(float32(float32, float32, float32), device=True) def cu_device_fn(x, y, z): - return x ** y / z + return x**y / z def cu_ufunc(x, y, z): return cu_device_fn(x, y, z) - ufunc = vectorize([float32(float32, float32, float32)], target='cuda')( - cu_ufunc) + ufunc = vectorize([float32(float32, float32, float32)], target="cuda")( + cu_ufunc + ) N = 100 @@ -27,10 +27,10 @@ def cu_ufunc(x, y, z): out = ufunc(X, Y, Z) - gold = (X ** Y) / Z + gold = (X**Y) / Z self.assertTrue(np.allclose(out, gold)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py index 1c65a41d7..e413e67d1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py @@ -7,11 +7,10 @@ sig = [float64(float64, float64)] -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorizeScalarArg(CUDATestCase): - def test_vectorize_scalar_arg(self): - @vectorize(sig, target='cuda') + @vectorize(sig, target="cuda") def vector_add(a, b): return a + b @@ -20,11 +19,11 @@ def vector_add(a, b): v = vector_add(1.0, dA) np.testing.assert_array_almost_equal( - v.copy_to_host(), - np.arange(1, 11, dtype=np.float64)) + v.copy_to_host(), np.arange(1, 11, dtype=np.float64) + ) def test_vectorize_all_scalars(self): - @vectorize(sig, target='cuda') + @vectorize(sig, target="cuda") def vector_add(a, b): return a + b @@ -33,5 +32,5 @@ def vector_add(a, b): np.testing.assert_almost_equal(2.0, v) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_warning.py 
b/numba_cuda/numba/cuda/tests/cudapy/test_warning.py index fbcb643fe..11fd61b55 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_warning.py @@ -7,27 +7,27 @@ import warnings -@skip_on_cudasim('cudasim does not raise performance warnings') +@skip_on_cudasim("cudasim does not raise performance warnings") class TestWarnings(CUDATestCase): def test_inefficient_launch_configuration(self): @cuda.jit def kernel(): pass - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: kernel[1, 1]() self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Grid size', str(w[0].message)) - self.assertIn('low occupancy', str(w[0].message)) + self.assertIn("Grid size", str(w[0].message)) + self.assertIn("low occupancy", str(w[0].message)) def test_efficient_launch_configuration(self): @cuda.jit def kernel(): pass - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: kernel[256, 256]() @@ -40,14 +40,15 @@ def foo(r, x): N = 10 arr_f32 = np.zeros(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](arr_f32, N) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Host array used in CUDA kernel will incur', - str(w[0].message)) - self.assertIn('copy overhead', str(w[0].message)) + self.assertIn( + "Host array used in CUDA kernel will incur", str(w[0].message) + ) + self.assertIn("copy overhead", str(w[0].message)) def test_pinned_warn_on_host_array(self): @cuda.jit @@ -57,14 +58,15 @@ def foo(r, x): N = 10 ary = cuda.pinned_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with 
override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Host array used in CUDA kernel will incur', - str(w[0].message)) - self.assertIn('copy overhead', str(w[0].message)) + self.assertIn( + "Host array used in CUDA kernel will incur", str(w[0].message) + ) + self.assertIn("copy overhead", str(w[0].message)) def test_nowarn_on_mapped_array(self): @cuda.jit @@ -74,7 +76,7 @@ def foo(r, x): N = 10 ary = cuda.mapped_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -89,7 +91,7 @@ def foo(r, x): N = 10 ary = cuda.managed_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -103,7 +105,7 @@ def foo(r, x): N = 10 ary = cuda.device_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -114,14 +116,14 @@ def test_warn_on_debug_and_opt(self): cuda.jit(debug=True, opt=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) def test_warn_on_debug_and_opt_default(self): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) def test_no_warn_on_debug_and_no_opt(self): with warnings.catch_warnings(record=True) as w: @@ -136,8 +138,8 @@ def test_no_warn_with_no_debug_and_opt_kwargs(self): self.assertEqual(len(w), 0) def 
test_no_warn_on_debug_and_opt_with_config(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): - with override_config('OPT', config._OptLevel(0)): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): + with override_config("OPT", config._OptLevel(0)): with warnings.catch_warnings(record=True) as w: cuda.jit() @@ -148,30 +150,30 @@ def test_no_warn_on_debug_and_opt_with_config(self): self.assertEqual(len(w), 0) - with override_config('OPT', config._OptLevel(0)): + with override_config("OPT", config._OptLevel(0)): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 0) def test_warn_on_debug_and_opt_with_config(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): - for opt in (1, 2, 3, 'max'): - with override_config('OPT', config._OptLevel(opt)): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): + for opt in (1, 2, 3, "max"): + with override_config("OPT", config._OptLevel(opt)): with warnings.catch_warnings(record=True) as w: cuda.jit() self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) - for opt in (1, 2, 3, 'max'): - with override_config('OPT', config._OptLevel(opt)): + for opt in (1, 2, 3, "max"): + with override_config("OPT", config._OptLevel(opt)): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py index 2fc157d07..6f3d0f26e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py @@ -8,73 +8,73 @@ def useful_syncwarp(ary): i = cuda.grid(1) if i == 0: ary[0] = 42 - cuda.syncwarp(0xffffffff) + 
cuda.syncwarp(0xFFFFFFFF) ary[i] = ary[0] def use_shfl_sync_idx(ary, idx): i = cuda.grid(1) - val = cuda.shfl_sync(0xffffffff, i, idx) + val = cuda.shfl_sync(0xFFFFFFFF, i, idx) ary[i] = val def use_shfl_sync_up(ary, delta): i = cuda.grid(1) - val = cuda.shfl_up_sync(0xffffffff, i, delta) + val = cuda.shfl_up_sync(0xFFFFFFFF, i, delta) ary[i] = val def use_shfl_sync_down(ary, delta): i = cuda.grid(1) - val = cuda.shfl_down_sync(0xffffffff, i, delta) + val = cuda.shfl_down_sync(0xFFFFFFFF, i, delta) ary[i] = val def use_shfl_sync_xor(ary, xor): i = cuda.grid(1) - val = cuda.shfl_xor_sync(0xffffffff, i, xor) + val = cuda.shfl_xor_sync(0xFFFFFFFF, i, xor) ary[i] = val def use_shfl_sync_with_val(ary, into): i = cuda.grid(1) - val = cuda.shfl_sync(0xffffffff, into, 0) + val = cuda.shfl_sync(0xFFFFFFFF, into, 0) ary[i] = val def use_vote_sync_all(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.all_sync(0xffffffff, ary_in[i]) + pred = cuda.all_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_any(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.any_sync(0xffffffff, ary_in[i]) + pred = cuda.any_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_eq(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.eq_sync(0xffffffff, ary_in[i]) + pred = cuda.eq_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_ballot(ary): i = cuda.threadIdx.x - ballot = cuda.ballot_sync(0xffffffff, True) + ballot = cuda.ballot_sync(0xFFFFFFFF, True) ary[i] = ballot def use_match_any_sync(ary_in, ary_out): i = cuda.grid(1) - ballot = cuda.match_any_sync(0xffffffff, ary_in[i]) + ballot = cuda.match_any_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = ballot def use_match_all_sync(ary_in, ary_out): i = cuda.grid(1) - ballot, pred = cuda.match_all_sync(0xffffffff, ary_in[i]) + ballot, pred = cuda.match_all_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = ballot if pred else 0 @@ -146,8 +146,12 @@ def test_shfl_sync_xor(self): def test_shfl_sync_types(self): types = int32, int64, 
float32, float64 - values = (np.int32(-1), np.int64(1 << 42), - np.float32(np.pi), np.float64(np.pi)) + values = ( + np.int32(-1), + np.int64(1 << 42), + np.float32(np.pi), + np.float64(np.pi), + ) for typ, val in zip(types, values): compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val) nelem = 32 @@ -197,10 +201,11 @@ def test_vote_sync_ballot(self): nelem = 32 ary = np.empty(nelem, dtype=np.uint32) compiled[1, nelem](ary) - self.assertTrue(np.all(ary == np.uint32(0xffffffff))) + self.assertTrue(np.all(ary == np.uint32(0xFFFFFFFF))) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Matching requires at least Volta Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture" + ) def test_match_any_sync(self): compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync) nelem = 10 @@ -210,8 +215,9 @@ def test_match_any_sync(self): compiled[1, nelem](ary_in, ary_out) self.assertTrue(np.all(ary_out == exp)) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Matching requires at least Volta Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture" + ) def test_match_all_sync(self): compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync) nelem = 10 @@ -223,9 +229,10 @@ def test_match_all_sync(self): compiled[1, nelem](ary_in, ary_out) self.assertTrue(np.all(ary_out == 0)) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Independent scheduling requires at least Volta " - "Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), + "Independent scheduling requires at least Volta Architecture", + ) def test_independent_scheduling(self): compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling) arr = np.empty(32, dtype=np.uint32) @@ -267,10 +274,9 @@ def use_lanemask_lt(x): # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc. # or in binary: # ...0001, ....0011, ...0111, etc. 
- expected = np.asarray([(2 ** i) - 1 for i in range(32)], - dtype=np.uint32) + expected = np.asarray([(2**i) - 1 for i in range(32)], dtype=np.uint32) np.testing.assert_equal(expected, out) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py b/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py index 0f544821a..c1e8d0b23 100644 --- a/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +++ b/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py @@ -10,12 +10,16 @@ class TestCudaSimIssues(CUDATestCase): def test_record_access(self): - backyard_type = [('statue', np.float64), - ('newspaper', np.float64, (6,))] + backyard_type = [ + ("statue", np.float64), + ("newspaper", np.float64, (6,)), + ] - goose_type = [('garden', np.float64, (12,)), - ('town', np.float64, (42,)), - ('backyard', backyard_type)] + goose_type = [ + ("garden", np.float64, (12,)), + ("town", np.float64, (42,)), + ("backyard", backyard_type), + ] goose_np_type = np.dtype(goose_type, align=True) @@ -27,20 +31,22 @@ def simple_kernel(f): item = np.recarray(1, dtype=goose_np_type) simple_kernel[1, 1](item[0]) - np.testing.assert_equal(item[0]['garden'][0], 45) - np.testing.assert_equal(item[0]['backyard']['newspaper'][3], 5) + np.testing.assert_equal(item[0]["garden"][0], 45) + np.testing.assert_equal(item[0]["backyard"]["newspaper"][3], 5) def test_recarray_setting(self): - recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) + recordwith2darray = np.dtype( + [("i", np.int32), ("j", np.float32, (3, 2))] + ) rec = np.recarray(2, dtype=recordwith2darray) - rec[0]['i'] = 45 + rec[0]["i"] = 45 @cuda.jit def simple_kernel(f): f[1] = f[0] + simple_kernel[1, 1](rec) - np.testing.assert_equal(rec[0]['i'], rec[1]['i']) + np.testing.assert_equal(rec[0]["i"], rec[1]["i"]) def test_cuda_module_in_device_function(self): """ @@ -63,7 +69,7 @@ def outer(out): expected = 
np.arange(arr.size, dtype=np.int32) np.testing.assert_equal(expected, arr) - @skip_unless_cudasim('Only works on CUDASIM') + @skip_unless_cudasim("Only works on CUDASIM") def test_deadlock_on_exception(self): def assert_no_blockthreads(): blockthreads = [] @@ -98,5 +104,5 @@ def assign_with_sync(x, y): assert_no_blockthreads() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/data/jitlink.cu b/numba_cuda/numba/cuda/tests/data/jitlink.cu index 4d245366c..a2737a6ef 100644 --- a/numba_cuda/numba/cuda/tests/data/jitlink.cu +++ b/numba_cuda/numba/cuda/tests/data/jitlink.cu @@ -20,4 +20,4 @@ int array_mutator(void *out, int *a) { a[0] = a[1]; return 0; -} +} diff --git a/numba_cuda/numba/cuda/tests/data/jitlink.ptx b/numba_cuda/numba/cuda/tests/data/jitlink.ptx index dde0cc214..fdbbb261f 100644 --- a/numba_cuda/numba/cuda/tests/data/jitlink.ptx +++ b/numba_cuda/numba/cuda/tests/data/jitlink.ptx @@ -47,5 +47,3 @@ st.param.b32 [func_retval0+0], %r2; ret; } - - diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py b/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py index fc8405dbb..a14f4eac5 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py @@ -2,14 +2,18 @@ # "magictoken" is used for markers as beginning and ending of example text. 
import unittest -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_if_cudadevrt_missing, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_on_cudasim, + skip_if_cudadevrt_missing, + skip_unless_cc_60, + skip_if_mvc_enabled, +) @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestCooperativeGroups(CUDATestCase): def test_ex_grid_sync(self): @@ -17,7 +21,7 @@ def test_ex_grid_sync(self): from numba import cuda, int32 import numpy as np - sig = (int32[:,::1],) + sig = (int32[:, ::1],) @cuda.jit(sig) def sequential_rows(M): @@ -34,6 +38,7 @@ def sequential_rows(M): # Wait until all threads have written their column element, # and that the write is visible to all other threads g.sync() + # magictoken.ex_grid_sync_kernel.end # magictoken.ex_grid_sync_data.begin @@ -48,9 +53,11 @@ def sequential_rows(M): # Skip this test if the grid size used in the example is too large for # a cooperative launch on the current GPU - mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim) + mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks( + blockdim + ) if mb < griddim: - self.skipTest('Device does not support a large enough coop grid') + self.skipTest("Device does not support a large enough coop grid") # magictoken.ex_grid_sync_launch.begin # Kernel launch - this is implicitly a cooperative launch @@ -73,5 +80,5 @@ def sequential_rows(M): np.testing.assert_equal(A, reference) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py b/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py index b879a12d2..f8ec6f51f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +++ 
b/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py @@ -41,6 +41,7 @@ def test_ex_cpu_gpu_compat(self): @numba.jit def business_logic(x, y, z): return 4 * z * (2 * x - (4 * y) / 2 * pi) + # ex_cpu_gpu_compat.define.end # ex_cpu_gpu_compat.cpurun.begin @@ -54,6 +55,7 @@ def f(res, xarr, yarr, zarr): if tid < len(xarr): # The function decorated with numba.jit may be directly reused res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid]) + # ex_cpu_gpu_compat.usegpu.end # ex_cpu_gpu_compat.launch.begin @@ -62,14 +64,9 @@ def f(res, xarr, yarr, zarr): # [-126.79644737231007, 416.28324559588634, -218912930.2987788] # ex_cpu_gpu_compat.launch.end - expect = [ - business_logic(x, y, z) for x, y, z in zip(X, Y, Z) - ] + expect = [business_logic(x, y, z) for x, y, z in zip(X, Y, Z)] - np.testing.assert_equal( - expect, - results.copy_to_host() - ) + np.testing.assert_equal(expect, results.copy_to_host()) if __name__ == "__main__": diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py b/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py index 48a59bddf..39e439f3f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py @@ -2,7 +2,7 @@ # "magictoken" is used for markers as beginning and ending of example text. 
import unittest -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim from numba.tests.support import skip_unless_cffi @@ -18,11 +18,12 @@ def test_ex_linking_cu(self): # Path to the source containing the foreign function # (here assumed to be in a subdirectory called "ffi") basedir = os.path.dirname(os.path.abspath(__file__)) - functions_cu = os.path.join(basedir, 'ffi', 'functions.cu') + functions_cu = os.path.join(basedir, "ffi", "functions.cu") # Declaration of the foreign function - mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)', - link=functions_cu) + mul = cuda.declare_device( + "mul_f32_f32", "float32(float32, float32)", link=functions_cu + ) # A kernel that calls mul; functions.cu is linked automatically due to # the call to mul. @@ -52,25 +53,29 @@ def test_ex_from_buffer(self): import os basedir = os.path.dirname(os.path.abspath(__file__)) - functions_cu = os.path.join(basedir, 'ffi', 'functions.cu') + functions_cu = os.path.join(basedir, "ffi", "functions.cu") # magictoken.ex_from_buffer_decl.begin - signature = 'float32(CPointer(float32), int32)' - sum_reduce = cuda.declare_device('sum_reduce', signature, - link=functions_cu) + signature = "float32(CPointer(float32), int32)" + sum_reduce = cuda.declare_device( + "sum_reduce", signature, link=functions_cu + ) # magictoken.ex_from_buffer_decl.end # magictoken.ex_from_buffer_kernel.begin import cffi + ffi = cffi.FFI() @cuda.jit def reduction_caller(result, array): array_ptr = ffi.from_buffer(array) result[()] = sum_reduce(array_ptr, len(array)) + # magictoken.ex_from_buffer_kernel.end import numpy as np + x = np.arange(10).astype(np.float32) r = np.ndarray((), dtype=np.float32) @@ -81,5 +86,5 @@ def reduction_caller(result, array): np.testing.assert_allclose(expected, actual) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py b/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py index 4caea9286..75f38446a 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py @@ -1,14 +1,18 @@ import unittest -from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing, - skip_on_cudasim, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_if_cudadevrt_missing, + skip_on_cudasim, + skip_unless_cc_60, + skip_if_mvc_enabled, +) from numba.tests.support import captured_stdout @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestLaplace(CUDATestCase): """ @@ -27,7 +31,6 @@ def tearDown(self): super().tearDown() def test_ex_laplace(self): - # set True to regenerate the figures that # accompany this example plot = False @@ -55,24 +58,25 @@ def test_ex_laplace(self): if plot: import matplotlib.pyplot as plt + fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66)) plt.plot( np.arange(len(buf_0)), buf_0.copy_to_host(), lw=3, marker="*", - color='black' + color="black", ) - plt.title('Initial State', fontsize=24) - plt.xlabel('Position', fontsize=24) - plt.ylabel('Temperature', fontsize=24) + plt.title("Initial State", fontsize=24) + plt.xlabel("Position", fontsize=24) + plt.ylabel("Temperature", fontsize=24) ax.set_xticks(ax.get_xticks(), fontsize=16) ax.set_yticks(ax.get_yticks(), fontsize=16) plt.xlim(0, len(data)) plt.ylim(0, 10001) - plt.savefig('laplace_initial.svg') + plt.savefig("laplace_initial.svg") # ex_laplace.kernel.begin @cuda.jit @@ -116,12 +120,11 @@ def solve_heat_equation(buf_0, buf_1, timesteps, k): # Wait for every thread to write before moving on grid.sync() + # ex_laplace.kernel.end # ex_laplace.launch.begin - 
solve_heat_equation.forall(len(data))( - buf_0, buf_1, niter, 0.25 - ) + solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25) # ex_laplace.launch.end results = buf_1.copy_to_host() @@ -129,20 +132,21 @@ def solve_heat_equation(buf_0, buf_1, timesteps, k): fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66)) plt.plot( np.arange(len(results)), - results, lw=3, + results, + lw=3, marker="*", - color='black' + color="black", ) plt.title(f"T = {niter}", fontsize=24) - plt.xlabel('Position', fontsize=24) - plt.ylabel('Temperature', fontsize=24) + plt.xlabel("Position", fontsize=24) + plt.ylabel("Temperature", fontsize=24) ax.set_xticks(ax.get_xticks(), fontsize=16) ax.set_yticks(ax.get_yticks(), fontsize=16) plt.ylim(0, max(results)) plt.xlim(0, len(results)) - plt.savefig('laplace_final.svg') + plt.savefig("laplace_final.svg") # Integral over the domain should be equal to its initial value. # Note that this should match the initial value of data[500] above, but diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py b/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py index 6e0dd44c1..9633954f0 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py @@ -6,6 +6,7 @@ Contents in this file are referenced from the sphinx-generated docs. "magictoken" is used for markers as beginning and ending of example text. """ + import unittest from numba.cuda.testing import CUDATestCase, skip_on_cudasim from numba.tests.support import captured_stdout @@ -43,10 +44,11 @@ def matmul(A, B, C): """Perform square matrix multiplication of C = A * B.""" i, j = cuda.grid(2) if i < C.shape[0] and j < C.shape[1]: - tmp = 0. 
+ tmp = 0.0 for k in range(A.shape[1]): tmp += A[i, k] * B[k, j] C[i, j] = tmp + # magictoken.ex_matmul.end # magictoken.ex_run_matmul.begin @@ -91,11 +93,11 @@ def fast_matmul(A, B, C): tx = cuda.threadIdx.x ty = cuda.threadIdx.y - bpg = cuda.gridDim.x # blocks per grid + bpg = cuda.gridDim.x # blocks per grid # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. - tmp = float32(0.) + tmp = float32(0.0) for i in range(bpg): # Preload data into shared memory sA[ty, tx] = 0 @@ -116,6 +118,7 @@ def fast_matmul(A, B, C): cuda.syncthreads() if y < C.shape[0] and x < C.shape[1]: C[y, x] = tmp + # magictoken.ex_fast_matmul.end # magictoken.ex_run_fast_matmul.begin @@ -169,5 +172,5 @@ def fast_matmul(A, B, C): self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py b/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py index 92627084f..8a5d9f46f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py @@ -59,6 +59,7 @@ def mc_integrator_kernel(out, rng_states, lower_lim, upper_lim): # value of the sample y = func(samp) out[gid] = y + # ex_montecarlo.kernel.end # ex_montecarlo.callfunc.begin @@ -84,6 +85,7 @@ def mc_integrate(lower_lim, upper_lim, nsamps): factor = (upper_lim - lower_lim) / (nsamps - 1) return sum_reduce(out) * factor + # ex_montecarlo.callfunc.end # ex_montecarlo.launch.begin diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py index 0e93a1f17..3ef8ec3a9 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py @@ -10,8 +10,10 @@ class TestRandom(CUDATestCase): def test_ex_3d_grid(self): # magictoken.ex_3d_grid.begin 
from numba import cuda - from numba.cuda.random import (create_xoroshiro128p_states, - xoroshiro128p_uniform_float32) + from numba.cuda.random import ( + create_xoroshiro128p_states, + xoroshiro128p_uniform_float32, + ) import numpy as np @cuda.jit @@ -27,7 +29,9 @@ def random_3d(arr, rng_states): for i in range(startz, arr.shape[0], stridez): for j in range(starty, arr.shape[1], stridey): for k in range(startx, arr.shape[2], stridex): - arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid) + arr[i, j, k] = xoroshiro128p_uniform_float32( + rng_states, tid + ) # Array dimensions X, Y, Z = 701, 900, 719 @@ -55,5 +59,5 @@ def random_3d(arr, rng_states): self.assertTrue(np.all(host_arr >= 0.0)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py b/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py index c118fbf15..92a0e6ade 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py @@ -61,11 +61,12 @@ def array_sum(data): # After the loop, the zeroth element contains the sum if tid == 0: data[tid] = shr[tid] + # ex_reduction.kernel.end # ex_reduction.launch.begin array_sum[1, nelem](a) - print(a[0]) # 523776 + print(a[0]) # 523776 print(sum(np.arange(1024))) # 523776 # ex_reduction.launch.end diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py b/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py index 6c66a6599..c3a23471a 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py @@ -1,14 +1,18 @@ import unittest -from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing, - skip_on_cudasim, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_if_cudadevrt_missing, + skip_on_cudasim, + skip_unless_cc_60, + 
skip_if_mvc_enabled, +) from numba.tests.support import captured_stdout @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestSessionization(CUDATestCase): """ @@ -40,26 +44,71 @@ def test_ex_sessionize(self): ids = cuda.to_device( np.array( [ - 1, 1, 1, 1, 1, 1, - 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, ] ) ) sec = cuda.to_device( np.array( [ - 1, 2, 3, 5000, 5001, 5002, 1, - 2, 3, 1, 2, 5000, 5001, 10000, - 10001, 10002, 10003, 15000, 150001, - 1, 5000, 50001, 15000, 20000, - 25000, 25001, 25002, 25003, + 1, + 2, + 3, + 5000, + 5001, + 5002, + 1, + 2, + 3, + 1, + 2, + 5000, + 5001, + 10000, + 10001, + 10002, + 10003, + 15000, + 150001, + 1, + 5000, + 50001, + 15000, + 20000, + 25000, + 25001, + 25002, + 25003, ], dtype="datetime64[ns]", - ).astype( - "int64" - ) # Cast to int64 for compatibility + ).astype("int64") # Cast to int64 for compatibility ) # Create a vector to hold the results results = cuda.to_device(np.zeros(len(ids))) @@ -105,6 +154,7 @@ def sessionize(user_id, timestamp, results): if gid + look_ahead == size - 1: results[gid + look_ahead] = gid break + # ex_sessionize.kernel.end # ex_sessionize.launch.begin @@ -119,9 +169,34 @@ def sessionize(user_id, timestamp, results): # ex_sessionize.launch.end expect = [ - 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, - 11, 11, 13, 13, 13, 13, 17, 18, 19, 20, 21, - 21, 23, 24, 24, 24, 24 + 0, + 0, + 0, + 3, + 3, + 3, + 6, + 6, + 6, + 9, + 9, + 11, + 11, + 13, + 13, + 13, + 13, + 17, + 18, + 19, + 20, + 21, + 21, + 23, + 24, + 24, + 24, + 24, ] np.testing.assert_equal(expect, results.copy_to_host()) diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py 
b/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py index c6ae197ee..64131f0a7 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py @@ -37,6 +37,7 @@ def f(a, b, c): if tid < size: c[tid] = a[tid] + b[tid] + # ex_vecadd.kernel.end # Seed RNG for test repeatability @@ -64,8 +65,7 @@ def f(a, b, c): # ex_vecadd.launch.end np.testing.assert_equal( - c.copy_to_host(), - a.copy_to_host() + b.copy_to_host() + c.copy_to_host(), a.copy_to_host() + b.copy_to_host() ) diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py index e4ad7d0fd..a870d1e38 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py @@ -7,9 +7,8 @@ @skip_on_cudasim("Tests internals of the CUDA driver device array") class TestSlicing(unittest.TestCase): - def assertSameContig(self, arr, nparr): - attrs = 'C_CONTIGUOUS', 'F_CONTIGUOUS' + attrs = "C_CONTIGUOUS", "F_CONTIGUOUS" for attr in attrs: if arr.flags[attr] != nparr.flags[attr]: if arr.size == 0 and nparr.size == 0: @@ -17,15 +16,18 @@ def assertSameContig(self, arr, nparr): # some are not pass else: - self.fail("contiguous flag mismatch:\ngot=%s\nexpect=%s" % - (arr.flags, nparr.flags)) + self.fail( + "contiguous flag mismatch:\ngot=%s\nexpect=%s" + % (arr.flags, nparr.flags) + ) #### 1D def test_slice0_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) self.assertSameContig(arr, nparr) xx = -2, -1, 0, 1, 2 for x in xx: @@ -37,8 +39,9 @@ def test_slice0_1d(self): def test_slice1_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 0, 1, 2 for 
x in xx: expect = nparr[:x] @@ -49,8 +52,9 @@ def test_slice1_1d(self): def test_slice2_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 0, 1, 2 for x, y in itertools.product(xx, xx): expect = nparr[x:y] @@ -63,8 +67,9 @@ def test_slice2_1d(self): def test_slice0_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 1, 2 for x in xx: expect = nparr[x:] @@ -82,8 +87,9 @@ def test_slice0_2d(self): def test_slice1_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 2 for x in xx: expect = nparr[:x] @@ -101,8 +107,9 @@ def test_slice1_2d(self): def test_slice2_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 2 for s, t, u, v in itertools.product(xx, xx, xx, xx): expect = nparr[s:t, u:v] @@ -122,8 +129,9 @@ def test_slice2_2d(self): def test_strided_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for x in xx: expect = nparr[::x] @@ -134,8 +142,9 @@ def test_strided_1d(self): def test_strided_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for a, b in itertools.product(xx, xx): expect = nparr[::a, ::b] @@ -146,8 +155,9 @@ def 
test_strided_2d(self): def test_strided_3d(self): nparr = np.empty((4, 5, 6)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for a, b, c in itertools.product(xx, xx, xx): expect = nparr[::a, ::b, ::c] @@ -160,16 +170,17 @@ def test_issue_2766(self): z = np.empty((1, 2, 3)) z = np.transpose(z, axes=(2, 0, 1)) arr = Array.from_desc(0, z.shape, z.strides, z.itemsize) - self.assertEqual(z.flags['C_CONTIGUOUS'], arr.flags['C_CONTIGUOUS']) - self.assertEqual(z.flags['F_CONTIGUOUS'], arr.flags['F_CONTIGUOUS']) + self.assertEqual(z.flags["C_CONTIGUOUS"], arr.flags["C_CONTIGUOUS"]) + self.assertEqual(z.flags["F_CONTIGUOUS"], arr.flags["F_CONTIGUOUS"]) @skip_on_cudasim("Tests internals of the CUDA driver device array") class TestReshape(unittest.TestCase): def test_reshape_2d2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, 4) got = arr.reshape(5, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -177,8 +188,9 @@ def test_reshape_2d2d(self): def test_reshape_2d1d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5 * 4) got = arr.reshape(5 * 4)[0] self.assertEqual(got.shape, expect.shape) @@ -186,8 +198,9 @@ def test_reshape_2d1d(self): def test_reshape_3d3d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, 3, 4) got = arr.reshape(5, 3, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -195,8 +208,9 @@ def test_reshape_3d3d(self): def 
test_reshape_3d2d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3 * 4, 5) got = arr.reshape(3 * 4, 5)[0] self.assertEqual(got.shape, expect.shape) @@ -204,8 +218,9 @@ def test_reshape_3d2d(self): def test_reshape_3d1d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3 * 4 * 5) got = arr.reshape(3 * 4 * 5)[0] self.assertEqual(got.shape, expect.shape) @@ -213,8 +228,9 @@ def test_reshape_3d1d(self): def test_reshape_infer2d2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1, 4) got = arr.reshape(-1, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -222,8 +238,9 @@ def test_reshape_infer2d2d(self): def test_reshape_infer2d1d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1) got = arr.reshape(-1)[0] self.assertEqual(got.shape, expect.shape) @@ -231,8 +248,9 @@ def test_reshape_infer2d1d(self): def test_reshape_infer3d3d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, -1, 4) got = arr.reshape(5, -1, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -240,8 +258,9 @@ def test_reshape_infer3d3d(self): def test_reshape_infer3d2d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - 
nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3, -1) got = arr.reshape(3, -1)[0] self.assertEqual(got.shape, expect.shape) @@ -249,8 +268,9 @@ def test_reshape_infer3d2d(self): def test_reshape_infer3d1d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1) got = arr.reshape(-1)[0] self.assertEqual(got.shape, expect.shape) @@ -258,23 +278,26 @@ def test_reshape_infer3d1d(self): def test_reshape_infer_two_unknowns(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) with self.assertRaises(ValueError) as raises: arr.reshape(-1, -1, 3) - self.assertIn('can only specify one unknown dimension', - str(raises.exception)) + self.assertIn( + "can only specify one unknown dimension", str(raises.exception) + ) def test_reshape_infer_invalid_shape(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) with self.assertRaises(ValueError) as raises: arr.reshape(-1, 7) - expected_message = 'cannot infer valid shape for unknown dimension' + expected_message = "cannot infer valid shape for unknown dimension" self.assertIn(expected_message, str(raises.exception)) @@ -289,6 +312,7 @@ def test_squeeze(self): def _assert_equal_shape_strides(arr1, arr2): self.assertEqual(arr1.shape, arr2.shape) self.assertEqual(arr1.strides, arr2.strides) + _assert_equal_shape_strides(arr, nparr) _assert_equal_shape_strides(arr.squeeze()[0], nparr.squeeze()) for axis in (0, 2, 4, (0, 2), (0, 4), (2, 4), (0, 2, 4)): @@ -311,29 +335,33 @@ def test_squeeze_invalid_axis(self): 
class TestExtent(unittest.TestCase): def test_extent_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) s, e = arr.extent self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) def test_extent_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) s, e = arr.extent self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) def test_extent_iter_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) [ext] = list(arr.iter_contiguous_extent()) self.assertEqual(ext, arr.extent) def test_extent_iter_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) [ext] = list(arr.iter_contiguous_extent()) self.assertEqual(ext, arr.extent) @@ -346,8 +374,9 @@ def test_for_loop(self): # for #4201 N = 5 nparr = np.empty(N) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) x = 0 # just a placeholder # this loop should not raise AssertionError @@ -355,5 +384,5 @@ def test_for_loop(self): x = val # noqa: F841 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py b/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py index 1153707bb..ec59f3fab 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py @@ -9,19 +9,28 @@ class 
TestFunctionResolution(unittest.TestCase): def test_fp16_binary_operators(self): from numba.cuda.descriptor import cuda_target - ops = (operator.add, operator.iadd, operator.sub, operator.isub, - operator.mul, operator.imul) + + ops = ( + operator.add, + operator.iadd, + operator.sub, + operator.isub, + operator.mul, + operator.imul, + ) for op in ops: fp16 = types.float16 typingctx = cuda_target.typing_context typingctx.refresh() fnty = typingctx.resolve_value_type(op) out = typingctx.resolve_function_type(fnty, (fp16, fp16), {}) - self.assertEqual(out, typing.signature(fp16, fp16, fp16), - msg=str(out)) + self.assertEqual( + out, typing.signature(fp16, fp16, fp16), msg=str(out) + ) def test_fp16_unary_operators(self): from numba.cuda.descriptor import cuda_target + ops = (operator.neg, abs) for op in ops: fp16 = types.float16 @@ -32,5 +41,5 @@ def test_fp16_unary_operators(self): self.assertEqual(out, typing.signature(fp16, fp16), msg=str(out)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_import.py b/numba_cuda/numba/cuda/tests/nocuda/test_import.py index 73126cd6e..b44ccbc95 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_import.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_import.py @@ -11,30 +11,30 @@ def test_no_impl_import(self): """ banlist = ( - 'numba.cpython.slicing', - 'numba.cpython.tupleobj', - 'numba.cpython.enumimpl', - 'numba.cpython.hashing', - 'numba.cpython.heapq', - 'numba.cpython.iterators', - 'numba.cpython.numbers', - 'numba.cpython.rangeobj', - 'numba.cpython.cmathimpl', - 'numba.cpython.mathimpl', - 'numba.cpython.printimpl', - 'numba.cpython.randomimpl', - 'numba.core.optional', - 'numba.misc.gdb_hook', - 'numba.misc.literal', - 'numba.misc.cffiimpl', - 'numba.np.linalg', - 'numba.np.polynomial', - 'numba.np.arraymath', - 'numba.np.npdatetime', - 'numba.np.npyimpl', - 'numba.typed.typeddict', - 'numba.typed.typedlist', - 
'numba.experimental.jitclass.base', + "numba.cpython.slicing", + "numba.cpython.tupleobj", + "numba.cpython.enumimpl", + "numba.cpython.hashing", + "numba.cpython.heapq", + "numba.cpython.iterators", + "numba.cpython.numbers", + "numba.cpython.rangeobj", + "numba.cpython.cmathimpl", + "numba.cpython.mathimpl", + "numba.cpython.printimpl", + "numba.cpython.randomimpl", + "numba.core.optional", + "numba.misc.gdb_hook", + "numba.misc.literal", + "numba.misc.cffiimpl", + "numba.np.linalg", + "numba.np.polynomial", + "numba.np.arraymath", + "numba.np.npdatetime", + "numba.np.npyimpl", + "numba.typed.typeddict", + "numba.typed.typedlist", + "numba.experimental.jitclass.base", ) code = "import sys; from numba import cuda; print(list(sys.modules))" @@ -45,5 +45,5 @@ def test_no_impl_import(self): self.assertFalse(unexpected, "some modules unexpectedly imported") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py b/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py index acf670829..bca9eb680 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py @@ -21,12 +21,12 @@ has_cuda = nvvm.is_available() -has_mp_get_context = hasattr(mp, 'get_context') +has_mp_get_context = hasattr(mp, "get_context") class LibraryLookupBase(SerialMixin, unittest.TestCase): def setUp(self): - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") qrecv = ctx.Queue() qsend = ctx.Queue() @@ -84,108 +84,108 @@ def check_lib_lookup(qout, qin): status = False -@skip_on_cudasim('Library detection unsupported in the simulator') -@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") 
+@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestLibDeviceLookUp(LibraryLookupBase): def test_libdevice_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) # Check that CUDA_HOME works by removing conda-env by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'libdevice')) + self.assertEqual(by, "CUDA_HOME") + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "libdevice")) self.assertFalse(warns) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, '') + self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_libdevice_path_decision() @staticmethod def do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_libdevice_path_decision() -@skip_on_cudasim('Library detection unsupported in the simulator') -@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") 
+@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestNvvmLookUp(LibraryLookupBase): def test_nvvm_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) # Check that CUDA_HOME works by removing conda-env by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') + self.assertEqual(by, "CUDA_HOME") self.assertFalse(warns) if IS_WIN32: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'bin')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "bin")) elif IS_OSX: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'lib')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "lib")) else: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'lib64')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "lib64")) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, '') + self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_nvvm_path_decision() @staticmethod def do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_nvvm_path_decision() -@skip_on_cudasim('Library detection unsupported in the simulator') 
-@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") +@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestCudaLibLookUp(LibraryLookupBase): def test_cudalib_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) @@ -194,14 +194,14 @@ def test_cudalib_path_decision(self): # Check that CUDA_HOME works by removing conda-env self.remote_do(self.do_clear_envs) by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') + self.assertEqual(by, "CUDA_HOME") self.assertFalse(warns) if IS_WIN32: - self.assertEqual(info, os.path.join('mycudahome', 'bin')) + self.assertEqual(info, os.path.join("mycudahome", "bin")) elif IS_OSX: - self.assertEqual(info, os.path.join('mycudahome', 'lib')) + self.assertEqual(info, os.path.join("mycudahome", "lib")) else: - self.assertEqual(info, os.path.join('mycudahome', 'lib64')) + self.assertEqual(info, os.path.join("mycudahome", "lib64")) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) @@ -211,18 +211,18 @@ def test_cudalib_path_decision(self): else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_cudalib_dir_path_decision() @staticmethod def 
do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_cudalib_dir_path_decision() @@ -231,8 +231,8 @@ def _fake_non_conda_env(): """ Monkeypatch sys.prefix to hide the fact we are in a conda-env """ - sys.prefix = '' + sys.prefix = "" -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py b/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py index 742aa1017..ef5af7b97 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py @@ -8,14 +8,17 @@ import unittest -original = "call void @llvm.memset.p0i8.i64(" \ - "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" +original = ( + "call void @llvm.memset.p0i8.i64(" + "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" +) -missing_align = "call void @llvm.memset.p0i8.i64(" \ - "i8* %arg.x.41, i8 0, i64 %0, i1 false)" +missing_align = ( + "call void @llvm.memset.p0i8.i64(i8* %arg.x.41, i8 0, i64 %0, i1 false)" +) -@skip_on_cudasim('libNVVM not supported in simulator') +@skip_on_cudasim("libNVVM not supported in simulator") @unittest.skipIf(utils.MACHINE_BITS == 32, "CUDA not support for 32-bit") @unittest.skipIf(not nvvm.is_available(), "No libNVVM") class TestNvvmWithoutCuda(unittest.TestCase): @@ -30,10 +33,9 @@ def test_nvvm_accepts_encoding(self): # NVVM that it cannot parse correctly # Create a module with a constant containing all 8-bit characters - c = ir.Constant(ir.ArrayType(ir.IntType(8), 256), - bytearray(range(256))) + c = ir.Constant(ir.ArrayType(ir.IntType(8), 256), bytearray(range(256))) m = ir.Module() - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(m) gv = ir.GlobalVariable(m, c.type, "myconstant") gv.global_constant = True @@ -46,9 +48,9 @@ def test_nvvm_accepts_encoding(self): # Ensure all characters appear in the generated 
constant array. elements = ", ".join([str(i) for i in range(256)]) - myconstant = f"myconstant[256] = {{{elements}}}".encode('utf-8') + myconstant = f"myconstant[256] = {{{elements}}}".encode("utf-8") self.assertIn(myconstant, ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py index 9f5f0fcf5..a621fe625 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py @@ -26,7 +26,7 @@ def g(): x = np.empty(10, np.int64) f(x) - g[1,1]() + g[1, 1]() cuda.synchronize() def test_nrt_ptx_contains_refcount(self): @@ -39,7 +39,7 @@ def g(): x = np.empty(10, np.int64) f(x) - g[1,1]() + g[1, 1]() ptx = next(iter(g.inspect_asm().values())) @@ -72,13 +72,12 @@ def g(out_ary): out_ary = np.zeros(1, dtype=np.int64) - g[1,1](out_ary) + g[1, 1](out_ary) self.assertEqual(out_ary[0], 1) class TestNrtStatistics(CUDATestCase): - def setUp(self): self._stream = cuda.default_stream() # Store the current stats state @@ -126,12 +125,11 @@ def foo(): # Check env var explicitly being set works env = os.environ.copy() - env['NUMBA_CUDA_NRT_STATS'] = "1" - env['NUMBA_CUDA_ENABLE_NRT'] = "1" + env["NUMBA_CUDA_NRT_STATS"] = "1" + env["NUMBA_CUDA_ENABLE_NRT"] = "1" run_in_subprocess(src, env=env) def check_env_var_off(self, env): - src = """if 1: from numba import cuda import numpy as np @@ -152,27 +150,26 @@ def foo(): def test_stats_env_var_explicit_off(self): # Checks that explicitly turning the stats off via the env var works. env = os.environ.copy() - env['NUMBA_CUDA_NRT_STATS'] = "0" + env["NUMBA_CUDA_NRT_STATS"] = "0" self.check_env_var_off(env) def test_stats_env_var_default_off(self): # Checks that the env var not being set is the same as "off", i.e. # default for Numba is off. 
env = os.environ.copy() - env.pop('NUMBA_CUDA_NRT_STATS', None) + env.pop("NUMBA_CUDA_NRT_STATS", None) self.check_env_var_off(env) def test_stats_status_toggle(self): - @cuda.jit def foo(): tmp = np.ones(3) - arr = np.arange(5 * tmp[0]) # noqa: F841 + arr = np.arange(5 * tmp[0]) # noqa: F841 return None with ( - override_config('CUDA_ENABLE_NRT', True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_ENABLE_NRT", True), + override_config("CUDA_NRT_STATS", True), ): # Switch on stats rtsys.memsys_enable_stats() @@ -218,9 +215,9 @@ def test_rtsys_stats_query_raises_exception_when_disabled(self): def test_nrt_explicit_stats_query_raises_exception_when_disabled(self): # Checks the various memsys_get_stats functions raise if queried when # the stats counters are disabled. - method_variations = ('alloc', 'free', 'mi_alloc', 'mi_free') + method_variations = ("alloc", "free", "mi_alloc", "mi_free") for meth in method_variations: - stats_func = getattr(rtsys, f'memsys_get_stats_{meth}') + stats_func = getattr(rtsys, f"memsys_get_stats_{meth}") with self.subTest(stats_func=stats_func): # Turn stats off rtsys.memsys_disable_stats() @@ -233,14 +230,13 @@ def test_read_one_stat(self): @cuda.jit def foo(): tmp = np.ones(3) - arr = np.arange(5 * tmp[0]) # noqa: F841 + arr = np.arange(5 * tmp[0]) # noqa: F841 return None with ( - override_config('CUDA_ENABLE_NRT', True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_ENABLE_NRT", True), + override_config("CUDA_NRT_STATS", True), ): - # Switch on stats rtsys.memsys_enable_stats() @@ -262,5 +258,5 @@ def foo(): self.assertEqual(stats.mi_free, stats_mi_free) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py index 1e9b7aa30..27811bdae 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py @@ -9,7 
+9,6 @@ class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase): - def setUp(self): super(TestNrtRefCt, self).setUp() @@ -19,7 +18,7 @@ def tearDown(self): def run(self, result=None): with ( override_config("CUDA_ENABLE_NRT", True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_NRT_STATS", True), ): super(TestNrtRefCt, self).run(result) @@ -33,7 +32,7 @@ def test_no_return(self): @cuda.jit def kernel(): for i in range(n): - temp = np.empty(2) # noqa: F841 + temp = np.empty(2) # noqa: F841 return None init_stats = rtsys.get_allocation_stats() @@ -49,14 +48,13 @@ def test_escaping_var_init_in_loop(self): @cuda.jit def g(n): - x = np.empty((n, 2)) for i in range(n): y = x[i] for i in range(n): - y = x[i] # noqa: F841 + y = x[i] # noqa: F841 return None @@ -70,6 +68,7 @@ def test_invalid_computation_of_lifetime(self): """ Test issue #1573 """ + @cuda.jit def if_with_allocation_and_initialization(arr1, test1): tmp_arr = np.empty_like(arr1) @@ -85,13 +84,15 @@ def if_with_allocation_and_initialization(arr1, test1): init_stats = rtsys.get_allocation_stats() if_with_allocation_and_initialization[1, 1](arr, False) cur_stats = rtsys.get_allocation_stats() - self.assertEqual(cur_stats.alloc - init_stats.alloc, - cur_stats.free - init_stats.free) + self.assertEqual( + cur_stats.alloc - init_stats.alloc, cur_stats.free - init_stats.free + ) def test_del_at_beginning_of_loop(self): """ Test issue #1734 """ + @cuda.jit def f(arr): res = 0 @@ -108,9 +109,10 @@ def f(arr): init_stats = rtsys.get_allocation_stats() f[1, 1](arr) cur_stats = rtsys.get_allocation_stats() - self.assertEqual(cur_stats.alloc - init_stats.alloc, - cur_stats.free - init_stats.free) + self.assertEqual( + cur_stats.alloc - init_stats.alloc, cur_stats.free - init_stats.free + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat index 
9d47a334d..5eb0cbe9d 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat @@ -58,4 +58,4 @@ nvcc %NVCC_FLAGS% %LIBRARY_FLAGS% -o %OUTPUT_DIR%\test_device_functions.a test_d nvcc %NVCC_FLAGS% %LTOIR_FLAGS% -o %OUTPUT_DIR%\test_device_functions.ltoir.o test_device_functions.cu @REM Generate LTO-IR in a "raw" LTO-IR container -python generate_raw_ltoir.py --arch sm_%GPU_CC% -o %OUTPUT_DIR%\test_device_functions.ltoir test_device_functions.cu \ No newline at end of file +python generate_raw_ltoir.py --arch sm_%GPU_CC% -o %OUTPUT_DIR%\test_device_functions.ltoir test_device_functions.cu diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py index 934410b07..b4d32a34c 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py @@ -58,7 +58,7 @@ def determine_include_flags(): return None # NVCC writes to stdout on Windows and stderr on Linux - if platform.system() == 'Windows': + if platform.system() == "Windows": stream = cp.stdout else: stream = cp.stderr @@ -157,7 +157,7 @@ def main(sourcepath, outputpath, arch): parser.add_argument( "-a", "--arch", - help="compute arch to target (e.g. sm_87). " "Defaults to sm_50.", + help="compute arch to target (e.g. sm_87). Defaults to sm_50.", default="sm_50", ) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 531dcb2cc..92b8f3ecb 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -5,16 +5,18 @@ class Dim3(types.Type): """ A 3-tuple (x, y, z) representing the position of a block or thread. """ + def __init__(self): - super().__init__(name='Dim3') + super().__init__(name="Dim3") class GridGroup(types.Type): """ The grid of all threads in a cooperative kernel launch. 
""" + def __init__(self): - super().__init__(name='GridGroup') + super().__init__(name="GridGroup") dim3 = Dim3() @@ -23,6 +25,7 @@ def __init__(self): class CUDADispatcher(types.Dispatcher): """The type of CUDA dispatchers""" + # This type exists (instead of using types.Dispatcher as the type of CUDA # dispatchers) so that we can have an alternative lowering for them to the # lowering of CPU dispatchers - the CPU target lowers all dispatchers as a diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 1ab3f9605..bcfff371f 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -10,8 +10,10 @@ import numpy as np from functools import lru_cache from numba.core import typing -from numba.cuda.mathimpl import (get_unary_impl_for_fn_and_ty, - get_binary_impl_for_fn_and_ty) +from numba.cuda.mathimpl import ( + get_unary_impl_for_fn_and_ty, + get_binary_impl_for_fn_and_ty, +) def get_ufunc_info(ufunc_key): @@ -173,490 +175,508 @@ def np_real_atanh_impl(context, builder, sig, args): db = {} db[np.sin] = { - 'f->f': np_real_sin_impl, - 'd->d': np_real_sin_impl, - 'F->F': npyfuncs.np_complex_sin_impl, - 'D->D': npyfuncs.np_complex_sin_impl, + "f->f": np_real_sin_impl, + "d->d": np_real_sin_impl, + "F->F": npyfuncs.np_complex_sin_impl, + "D->D": npyfuncs.np_complex_sin_impl, } db[np.cos] = { - 'f->f': np_real_cos_impl, - 'd->d': np_real_cos_impl, - 'F->F': npyfuncs.np_complex_cos_impl, - 'D->D': npyfuncs.np_complex_cos_impl, + "f->f": np_real_cos_impl, + "d->d": np_real_cos_impl, + "F->F": npyfuncs.np_complex_cos_impl, + "D->D": npyfuncs.np_complex_cos_impl, } db[np.tan] = { - 'f->f': np_real_tan_impl, - 'd->d': np_real_tan_impl, - 'F->F': cmathimpl.tan_impl, - 'D->D': cmathimpl.tan_impl, + "f->f": np_real_tan_impl, + "d->d": np_real_tan_impl, + "F->F": cmathimpl.tan_impl, + "D->D": cmathimpl.tan_impl, } db[np.arcsin] = { - 'f->f': np_real_asin_impl, - 'd->d': np_real_asin_impl, - 'F->F': cmathimpl.asin_impl, - 
'D->D': cmathimpl.asin_impl, + "f->f": np_real_asin_impl, + "d->d": np_real_asin_impl, + "F->F": cmathimpl.asin_impl, + "D->D": cmathimpl.asin_impl, } db[np.arccos] = { - 'f->f': np_real_acos_impl, - 'd->d': np_real_acos_impl, - 'F->F': cmathimpl.acos_impl, - 'D->D': cmathimpl.acos_impl, + "f->f": np_real_acos_impl, + "d->d": np_real_acos_impl, + "F->F": cmathimpl.acos_impl, + "D->D": cmathimpl.acos_impl, } db[np.arctan] = { - 'f->f': np_real_atan_impl, - 'd->d': np_real_atan_impl, - 'F->F': cmathimpl.atan_impl, - 'D->D': cmathimpl.atan_impl, + "f->f": np_real_atan_impl, + "d->d": np_real_atan_impl, + "F->F": cmathimpl.atan_impl, + "D->D": cmathimpl.atan_impl, } db[np.arctan2] = { - 'ff->f': np_real_atan2_impl, - 'dd->d': np_real_atan2_impl, + "ff->f": np_real_atan2_impl, + "dd->d": np_real_atan2_impl, } db[np.hypot] = { - 'ff->f': np_real_hypot_impl, - 'dd->d': np_real_hypot_impl, + "ff->f": np_real_hypot_impl, + "dd->d": np_real_hypot_impl, } db[np.sinh] = { - 'f->f': np_real_sinh_impl, - 'd->d': np_real_sinh_impl, - 'F->F': np_complex_sinh_impl, - 'D->D': np_complex_sinh_impl, + "f->f": np_real_sinh_impl, + "d->d": np_real_sinh_impl, + "F->F": np_complex_sinh_impl, + "D->D": np_complex_sinh_impl, } db[np.cosh] = { - 'f->f': np_real_cosh_impl, - 'd->d': np_real_cosh_impl, - 'F->F': np_complex_cosh_impl, - 'D->D': np_complex_cosh_impl, + "f->f": np_real_cosh_impl, + "d->d": np_real_cosh_impl, + "F->F": np_complex_cosh_impl, + "D->D": np_complex_cosh_impl, } db[np.tanh] = { - 'f->f': np_real_tanh_impl, - 'd->d': np_real_tanh_impl, - 'F->F': np_complex_tanh_impl, - 'D->D': np_complex_tanh_impl, + "f->f": np_real_tanh_impl, + "d->d": np_real_tanh_impl, + "F->F": np_complex_tanh_impl, + "D->D": np_complex_tanh_impl, } db[np.arcsinh] = { - 'f->f': np_real_asinh_impl, - 'd->d': np_real_asinh_impl, - 'F->F': cmathimpl.asinh_impl, - 'D->D': cmathimpl.asinh_impl, + "f->f": np_real_asinh_impl, + "d->d": np_real_asinh_impl, + "F->F": cmathimpl.asinh_impl, + "D->D": 
cmathimpl.asinh_impl, } db[np.arccosh] = { - 'f->f': np_real_acosh_impl, - 'd->d': np_real_acosh_impl, - 'F->F': npyfuncs.np_complex_acosh_impl, - 'D->D': npyfuncs.np_complex_acosh_impl, + "f->f": np_real_acosh_impl, + "d->d": np_real_acosh_impl, + "F->F": npyfuncs.np_complex_acosh_impl, + "D->D": npyfuncs.np_complex_acosh_impl, } db[np.arctanh] = { - 'f->f': np_real_atanh_impl, - 'd->d': np_real_atanh_impl, - 'F->F': cmathimpl.atanh_impl, - 'D->D': cmathimpl.atanh_impl, + "f->f": np_real_atanh_impl, + "d->d": np_real_atanh_impl, + "F->F": cmathimpl.atanh_impl, + "D->D": cmathimpl.atanh_impl, } db[np.deg2rad] = { - 'f->f': mathimpl.radians_float_impl, - 'd->d': mathimpl.radians_float_impl, + "f->f": mathimpl.radians_float_impl, + "d->d": mathimpl.radians_float_impl, } db[np.radians] = db[np.deg2rad] db[np.rad2deg] = { - 'f->f': mathimpl.degrees_float_impl, - 'd->d': mathimpl.degrees_float_impl, + "f->f": mathimpl.degrees_float_impl, + "d->d": mathimpl.degrees_float_impl, } db[np.degrees] = db[np.rad2deg] db[np.greater] = { - '??->?': numbers.int_ugt_impl, - 'bb->?': numbers.int_sgt_impl, - 'BB->?': numbers.int_ugt_impl, - 'hh->?': numbers.int_sgt_impl, - 'HH->?': numbers.int_ugt_impl, - 'ii->?': numbers.int_sgt_impl, - 'II->?': numbers.int_ugt_impl, - 'll->?': numbers.int_sgt_impl, - 'LL->?': numbers.int_ugt_impl, - 'qq->?': numbers.int_sgt_impl, - 'QQ->?': numbers.int_ugt_impl, - 'ff->?': numbers.real_gt_impl, - 'dd->?': numbers.real_gt_impl, - 'FF->?': npyfuncs.np_complex_gt_impl, - 'DD->?': npyfuncs.np_complex_gt_impl, + "??->?": numbers.int_ugt_impl, + "bb->?": numbers.int_sgt_impl, + "BB->?": numbers.int_ugt_impl, + "hh->?": numbers.int_sgt_impl, + "HH->?": numbers.int_ugt_impl, + "ii->?": numbers.int_sgt_impl, + "II->?": numbers.int_ugt_impl, + "ll->?": numbers.int_sgt_impl, + "LL->?": numbers.int_ugt_impl, + "qq->?": numbers.int_sgt_impl, + "QQ->?": numbers.int_ugt_impl, + "ff->?": numbers.real_gt_impl, + "dd->?": numbers.real_gt_impl, + "FF->?": 
npyfuncs.np_complex_gt_impl, + "DD->?": npyfuncs.np_complex_gt_impl, } if numpy_version >= (1, 25): - db[np.greater].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('>'), - 'Qq->?': numbers.int_unsigned_signed_cmp('>')}) + db[np.greater].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">"), + "Qq->?": numbers.int_unsigned_signed_cmp(">"), + } + ) db[np.greater_equal] = { - '??->?': numbers.int_uge_impl, - 'bb->?': numbers.int_sge_impl, - 'BB->?': numbers.int_uge_impl, - 'hh->?': numbers.int_sge_impl, - 'HH->?': numbers.int_uge_impl, - 'ii->?': numbers.int_sge_impl, - 'II->?': numbers.int_uge_impl, - 'll->?': numbers.int_sge_impl, - 'LL->?': numbers.int_uge_impl, - 'qq->?': numbers.int_sge_impl, - 'QQ->?': numbers.int_uge_impl, - 'ff->?': numbers.real_ge_impl, - 'dd->?': numbers.real_ge_impl, - 'FF->?': npyfuncs.np_complex_ge_impl, - 'DD->?': npyfuncs.np_complex_ge_impl, + "??->?": numbers.int_uge_impl, + "bb->?": numbers.int_sge_impl, + "BB->?": numbers.int_uge_impl, + "hh->?": numbers.int_sge_impl, + "HH->?": numbers.int_uge_impl, + "ii->?": numbers.int_sge_impl, + "II->?": numbers.int_uge_impl, + "ll->?": numbers.int_sge_impl, + "LL->?": numbers.int_uge_impl, + "qq->?": numbers.int_sge_impl, + "QQ->?": numbers.int_uge_impl, + "ff->?": numbers.real_ge_impl, + "dd->?": numbers.real_ge_impl, + "FF->?": npyfuncs.np_complex_ge_impl, + "DD->?": npyfuncs.np_complex_ge_impl, } if numpy_version >= (1, 25): - db[np.greater_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('>='), - 'Qq->?': numbers.int_unsigned_signed_cmp('>=')}) + db[np.greater_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">="), + "Qq->?": numbers.int_unsigned_signed_cmp(">="), + } + ) db[np.less] = { - '??->?': numbers.int_ult_impl, - 'bb->?': numbers.int_slt_impl, - 'BB->?': numbers.int_ult_impl, - 'hh->?': numbers.int_slt_impl, - 'HH->?': numbers.int_ult_impl, - 'ii->?': numbers.int_slt_impl, - 'II->?': numbers.int_ult_impl, - 'll->?': numbers.int_slt_impl, - 
'LL->?': numbers.int_ult_impl, - 'qq->?': numbers.int_slt_impl, - 'QQ->?': numbers.int_ult_impl, - 'ff->?': numbers.real_lt_impl, - 'dd->?': numbers.real_lt_impl, - 'FF->?': npyfuncs.np_complex_lt_impl, - 'DD->?': npyfuncs.np_complex_lt_impl, + "??->?": numbers.int_ult_impl, + "bb->?": numbers.int_slt_impl, + "BB->?": numbers.int_ult_impl, + "hh->?": numbers.int_slt_impl, + "HH->?": numbers.int_ult_impl, + "ii->?": numbers.int_slt_impl, + "II->?": numbers.int_ult_impl, + "ll->?": numbers.int_slt_impl, + "LL->?": numbers.int_ult_impl, + "qq->?": numbers.int_slt_impl, + "QQ->?": numbers.int_ult_impl, + "ff->?": numbers.real_lt_impl, + "dd->?": numbers.real_lt_impl, + "FF->?": npyfuncs.np_complex_lt_impl, + "DD->?": npyfuncs.np_complex_lt_impl, } if numpy_version >= (1, 25): - db[np.less].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('<'), - 'Qq->?': numbers.int_unsigned_signed_cmp('<')}) + db[np.less].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<"), + "Qq->?": numbers.int_unsigned_signed_cmp("<"), + } + ) db[np.less_equal] = { - '??->?': numbers.int_ule_impl, - 'bb->?': numbers.int_sle_impl, - 'BB->?': numbers.int_ule_impl, - 'hh->?': numbers.int_sle_impl, - 'HH->?': numbers.int_ule_impl, - 'ii->?': numbers.int_sle_impl, - 'II->?': numbers.int_ule_impl, - 'll->?': numbers.int_sle_impl, - 'LL->?': numbers.int_ule_impl, - 'qq->?': numbers.int_sle_impl, - 'QQ->?': numbers.int_ule_impl, - 'ff->?': numbers.real_le_impl, - 'dd->?': numbers.real_le_impl, - 'FF->?': npyfuncs.np_complex_le_impl, - 'DD->?': npyfuncs.np_complex_le_impl, + "??->?": numbers.int_ule_impl, + "bb->?": numbers.int_sle_impl, + "BB->?": numbers.int_ule_impl, + "hh->?": numbers.int_sle_impl, + "HH->?": numbers.int_ule_impl, + "ii->?": numbers.int_sle_impl, + "II->?": numbers.int_ule_impl, + "ll->?": numbers.int_sle_impl, + "LL->?": numbers.int_ule_impl, + "qq->?": numbers.int_sle_impl, + "QQ->?": numbers.int_ule_impl, + "ff->?": numbers.real_le_impl, + "dd->?": numbers.real_le_impl, + 
"FF->?": npyfuncs.np_complex_le_impl, + "DD->?": npyfuncs.np_complex_le_impl, } if numpy_version >= (1, 25): - db[np.less_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('<='), - 'Qq->?': numbers.int_unsigned_signed_cmp('<=')}) + db[np.less_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<="), + "Qq->?": numbers.int_unsigned_signed_cmp("<="), + } + ) db[np.not_equal] = { - '??->?': numbers.int_ne_impl, - 'bb->?': numbers.int_ne_impl, - 'BB->?': numbers.int_ne_impl, - 'hh->?': numbers.int_ne_impl, - 'HH->?': numbers.int_ne_impl, - 'ii->?': numbers.int_ne_impl, - 'II->?': numbers.int_ne_impl, - 'll->?': numbers.int_ne_impl, - 'LL->?': numbers.int_ne_impl, - 'qq->?': numbers.int_ne_impl, - 'QQ->?': numbers.int_ne_impl, - 'ff->?': numbers.real_ne_impl, - 'dd->?': numbers.real_ne_impl, - 'FF->?': npyfuncs.np_complex_ne_impl, - 'DD->?': npyfuncs.np_complex_ne_impl, + "??->?": numbers.int_ne_impl, + "bb->?": numbers.int_ne_impl, + "BB->?": numbers.int_ne_impl, + "hh->?": numbers.int_ne_impl, + "HH->?": numbers.int_ne_impl, + "ii->?": numbers.int_ne_impl, + "II->?": numbers.int_ne_impl, + "ll->?": numbers.int_ne_impl, + "LL->?": numbers.int_ne_impl, + "qq->?": numbers.int_ne_impl, + "QQ->?": numbers.int_ne_impl, + "ff->?": numbers.real_ne_impl, + "dd->?": numbers.real_ne_impl, + "FF->?": npyfuncs.np_complex_ne_impl, + "DD->?": npyfuncs.np_complex_ne_impl, } if numpy_version >= (1, 25): - db[np.not_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('!='), - 'Qq->?': numbers.int_unsigned_signed_cmp('!=')}) + db[np.not_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("!="), + "Qq->?": numbers.int_unsigned_signed_cmp("!="), + } + ) db[np.equal] = { - '??->?': numbers.int_eq_impl, - 'bb->?': numbers.int_eq_impl, - 'BB->?': numbers.int_eq_impl, - 'hh->?': numbers.int_eq_impl, - 'HH->?': numbers.int_eq_impl, - 'ii->?': numbers.int_eq_impl, - 'II->?': numbers.int_eq_impl, - 'll->?': numbers.int_eq_impl, - 'LL->?': numbers.int_eq_impl, 
- 'qq->?': numbers.int_eq_impl, - 'QQ->?': numbers.int_eq_impl, - 'ff->?': numbers.real_eq_impl, - 'dd->?': numbers.real_eq_impl, - 'FF->?': npyfuncs.np_complex_eq_impl, - 'DD->?': npyfuncs.np_complex_eq_impl, + "??->?": numbers.int_eq_impl, + "bb->?": numbers.int_eq_impl, + "BB->?": numbers.int_eq_impl, + "hh->?": numbers.int_eq_impl, + "HH->?": numbers.int_eq_impl, + "ii->?": numbers.int_eq_impl, + "II->?": numbers.int_eq_impl, + "ll->?": numbers.int_eq_impl, + "LL->?": numbers.int_eq_impl, + "qq->?": numbers.int_eq_impl, + "QQ->?": numbers.int_eq_impl, + "ff->?": numbers.real_eq_impl, + "dd->?": numbers.real_eq_impl, + "FF->?": npyfuncs.np_complex_eq_impl, + "DD->?": npyfuncs.np_complex_eq_impl, } if numpy_version >= (1, 25): - db[np.equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('=='), - 'Qq->?': numbers.int_unsigned_signed_cmp('==')}) + db[np.equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("=="), + "Qq->?": numbers.int_unsigned_signed_cmp("=="), + } + ) db[np.logical_and] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->?': npyfuncs.np_logical_and_impl, - 'BB->?': npyfuncs.np_logical_and_impl, - 'hh->?': npyfuncs.np_logical_and_impl, - 'HH->?': npyfuncs.np_logical_and_impl, - 'ii->?': npyfuncs.np_logical_and_impl, - 'II->?': npyfuncs.np_logical_and_impl, - 'll->?': npyfuncs.np_logical_and_impl, - 'LL->?': npyfuncs.np_logical_and_impl, - 'qq->?': npyfuncs.np_logical_and_impl, - 'QQ->?': npyfuncs.np_logical_and_impl, - 'ff->?': npyfuncs.np_logical_and_impl, - 'dd->?': npyfuncs.np_logical_and_impl, - 'FF->?': npyfuncs.np_complex_logical_and_impl, - 'DD->?': npyfuncs.np_complex_logical_and_impl, + "??->?": npyfuncs.np_logical_and_impl, + "bb->?": npyfuncs.np_logical_and_impl, + "BB->?": npyfuncs.np_logical_and_impl, + "hh->?": npyfuncs.np_logical_and_impl, + "HH->?": npyfuncs.np_logical_and_impl, + "ii->?": npyfuncs.np_logical_and_impl, + "II->?": npyfuncs.np_logical_and_impl, + "ll->?": npyfuncs.np_logical_and_impl, + "LL->?": 
npyfuncs.np_logical_and_impl, + "qq->?": npyfuncs.np_logical_and_impl, + "QQ->?": npyfuncs.np_logical_and_impl, + "ff->?": npyfuncs.np_logical_and_impl, + "dd->?": npyfuncs.np_logical_and_impl, + "FF->?": npyfuncs.np_complex_logical_and_impl, + "DD->?": npyfuncs.np_complex_logical_and_impl, } db[np.logical_or] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->?': npyfuncs.np_logical_or_impl, - 'BB->?': npyfuncs.np_logical_or_impl, - 'hh->?': npyfuncs.np_logical_or_impl, - 'HH->?': npyfuncs.np_logical_or_impl, - 'ii->?': npyfuncs.np_logical_or_impl, - 'II->?': npyfuncs.np_logical_or_impl, - 'll->?': npyfuncs.np_logical_or_impl, - 'LL->?': npyfuncs.np_logical_or_impl, - 'qq->?': npyfuncs.np_logical_or_impl, - 'QQ->?': npyfuncs.np_logical_or_impl, - 'ff->?': npyfuncs.np_logical_or_impl, - 'dd->?': npyfuncs.np_logical_or_impl, - 'FF->?': npyfuncs.np_complex_logical_or_impl, - 'DD->?': npyfuncs.np_complex_logical_or_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->?": npyfuncs.np_logical_or_impl, + "BB->?": npyfuncs.np_logical_or_impl, + "hh->?": npyfuncs.np_logical_or_impl, + "HH->?": npyfuncs.np_logical_or_impl, + "ii->?": npyfuncs.np_logical_or_impl, + "II->?": npyfuncs.np_logical_or_impl, + "ll->?": npyfuncs.np_logical_or_impl, + "LL->?": npyfuncs.np_logical_or_impl, + "qq->?": npyfuncs.np_logical_or_impl, + "QQ->?": npyfuncs.np_logical_or_impl, + "ff->?": npyfuncs.np_logical_or_impl, + "dd->?": npyfuncs.np_logical_or_impl, + "FF->?": npyfuncs.np_complex_logical_or_impl, + "DD->?": npyfuncs.np_complex_logical_or_impl, } db[np.logical_xor] = { - '??->?': npyfuncs.np_logical_xor_impl, - 'bb->?': npyfuncs.np_logical_xor_impl, - 'BB->?': npyfuncs.np_logical_xor_impl, - 'hh->?': npyfuncs.np_logical_xor_impl, - 'HH->?': npyfuncs.np_logical_xor_impl, - 'ii->?': npyfuncs.np_logical_xor_impl, - 'II->?': npyfuncs.np_logical_xor_impl, - 'll->?': npyfuncs.np_logical_xor_impl, - 'LL->?': npyfuncs.np_logical_xor_impl, - 'qq->?': npyfuncs.np_logical_xor_impl, - 'QQ->?': 
npyfuncs.np_logical_xor_impl, - 'ff->?': npyfuncs.np_logical_xor_impl, - 'dd->?': npyfuncs.np_logical_xor_impl, - 'FF->?': npyfuncs.np_complex_logical_xor_impl, - 'DD->?': npyfuncs.np_complex_logical_xor_impl, + "??->?": npyfuncs.np_logical_xor_impl, + "bb->?": npyfuncs.np_logical_xor_impl, + "BB->?": npyfuncs.np_logical_xor_impl, + "hh->?": npyfuncs.np_logical_xor_impl, + "HH->?": npyfuncs.np_logical_xor_impl, + "ii->?": npyfuncs.np_logical_xor_impl, + "II->?": npyfuncs.np_logical_xor_impl, + "ll->?": npyfuncs.np_logical_xor_impl, + "LL->?": npyfuncs.np_logical_xor_impl, + "qq->?": npyfuncs.np_logical_xor_impl, + "QQ->?": npyfuncs.np_logical_xor_impl, + "ff->?": npyfuncs.np_logical_xor_impl, + "dd->?": npyfuncs.np_logical_xor_impl, + "FF->?": npyfuncs.np_complex_logical_xor_impl, + "DD->?": npyfuncs.np_complex_logical_xor_impl, } db[np.logical_not] = { - '?->?': npyfuncs.np_logical_not_impl, - 'b->?': npyfuncs.np_logical_not_impl, - 'B->?': npyfuncs.np_logical_not_impl, - 'h->?': npyfuncs.np_logical_not_impl, - 'H->?': npyfuncs.np_logical_not_impl, - 'i->?': npyfuncs.np_logical_not_impl, - 'I->?': npyfuncs.np_logical_not_impl, - 'l->?': npyfuncs.np_logical_not_impl, - 'L->?': npyfuncs.np_logical_not_impl, - 'q->?': npyfuncs.np_logical_not_impl, - 'Q->?': npyfuncs.np_logical_not_impl, - 'f->?': npyfuncs.np_logical_not_impl, - 'd->?': npyfuncs.np_logical_not_impl, - 'F->?': npyfuncs.np_complex_logical_not_impl, - 'D->?': npyfuncs.np_complex_logical_not_impl, + "?->?": npyfuncs.np_logical_not_impl, + "b->?": npyfuncs.np_logical_not_impl, + "B->?": npyfuncs.np_logical_not_impl, + "h->?": npyfuncs.np_logical_not_impl, + "H->?": npyfuncs.np_logical_not_impl, + "i->?": npyfuncs.np_logical_not_impl, + "I->?": npyfuncs.np_logical_not_impl, + "l->?": npyfuncs.np_logical_not_impl, + "L->?": npyfuncs.np_logical_not_impl, + "q->?": npyfuncs.np_logical_not_impl, + "Q->?": npyfuncs.np_logical_not_impl, + "f->?": npyfuncs.np_logical_not_impl, + "d->?": 
npyfuncs.np_logical_not_impl, + "F->?": npyfuncs.np_complex_logical_not_impl, + "D->?": npyfuncs.np_complex_logical_not_impl, } db[np.maximum] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->b': npyfuncs.np_int_smax_impl, - 'BB->B': npyfuncs.np_int_umax_impl, - 'hh->h': npyfuncs.np_int_smax_impl, - 'HH->H': npyfuncs.np_int_umax_impl, - 'ii->i': npyfuncs.np_int_smax_impl, - 'II->I': npyfuncs.np_int_umax_impl, - 'll->l': npyfuncs.np_int_smax_impl, - 'LL->L': npyfuncs.np_int_umax_impl, - 'qq->q': npyfuncs.np_int_smax_impl, - 'QQ->Q': npyfuncs.np_int_umax_impl, - 'ff->f': npyfuncs.np_real_maximum_impl, - 'dd->d': npyfuncs.np_real_maximum_impl, - 'FF->F': npyfuncs.np_complex_maximum_impl, - 'DD->D': npyfuncs.np_complex_maximum_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_maximum_impl, + "dd->d": npyfuncs.np_real_maximum_impl, + "FF->F": npyfuncs.np_complex_maximum_impl, + "DD->D": npyfuncs.np_complex_maximum_impl, } db[np.minimum] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->b': npyfuncs.np_int_smin_impl, - 'BB->B': npyfuncs.np_int_umin_impl, - 'hh->h': npyfuncs.np_int_smin_impl, - 'HH->H': npyfuncs.np_int_umin_impl, - 'ii->i': npyfuncs.np_int_smin_impl, - 'II->I': npyfuncs.np_int_umin_impl, - 'll->l': npyfuncs.np_int_smin_impl, - 'LL->L': npyfuncs.np_int_umin_impl, - 'qq->q': npyfuncs.np_int_smin_impl, - 'QQ->Q': npyfuncs.np_int_umin_impl, - 'ff->f': npyfuncs.np_real_minimum_impl, - 'dd->d': npyfuncs.np_real_minimum_impl, - 'FF->F': npyfuncs.np_complex_minimum_impl, - 'DD->D': npyfuncs.np_complex_minimum_impl, + "??->?": npyfuncs.np_logical_and_impl, + 
"bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_minimum_impl, + "dd->d": npyfuncs.np_real_minimum_impl, + "FF->F": npyfuncs.np_complex_minimum_impl, + "DD->D": npyfuncs.np_complex_minimum_impl, } db[np.fmax] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->b': npyfuncs.np_int_smax_impl, - 'BB->B': npyfuncs.np_int_umax_impl, - 'hh->h': npyfuncs.np_int_smax_impl, - 'HH->H': npyfuncs.np_int_umax_impl, - 'ii->i': npyfuncs.np_int_smax_impl, - 'II->I': npyfuncs.np_int_umax_impl, - 'll->l': npyfuncs.np_int_smax_impl, - 'LL->L': npyfuncs.np_int_umax_impl, - 'qq->q': npyfuncs.np_int_smax_impl, - 'QQ->Q': npyfuncs.np_int_umax_impl, - 'ff->f': npyfuncs.np_real_fmax_impl, - 'dd->d': npyfuncs.np_real_fmax_impl, - 'FF->F': npyfuncs.np_complex_fmax_impl, - 'DD->D': npyfuncs.np_complex_fmax_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_fmax_impl, + "dd->d": npyfuncs.np_real_fmax_impl, + "FF->F": npyfuncs.np_complex_fmax_impl, + "DD->D": npyfuncs.np_complex_fmax_impl, } db[np.fmin] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->b': npyfuncs.np_int_smin_impl, - 'BB->B': npyfuncs.np_int_umin_impl, - 'hh->h': npyfuncs.np_int_smin_impl, - 'HH->H': npyfuncs.np_int_umin_impl, - 'ii->i': npyfuncs.np_int_smin_impl, - 'II->I': 
npyfuncs.np_int_umin_impl, - 'll->l': npyfuncs.np_int_smin_impl, - 'LL->L': npyfuncs.np_int_umin_impl, - 'qq->q': npyfuncs.np_int_smin_impl, - 'QQ->Q': npyfuncs.np_int_umin_impl, - 'ff->f': npyfuncs.np_real_fmin_impl, - 'dd->d': npyfuncs.np_real_fmin_impl, - 'FF->F': npyfuncs.np_complex_fmin_impl, - 'DD->D': npyfuncs.np_complex_fmin_impl, + "??->?": npyfuncs.np_logical_and_impl, + "bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_fmin_impl, + "dd->d": npyfuncs.np_real_fmin_impl, + "FF->F": npyfuncs.np_complex_fmin_impl, + "DD->D": npyfuncs.np_complex_fmin_impl, } db[np.bitwise_and] = { - '??->?': numbers.int_and_impl, - 'bb->b': numbers.int_and_impl, - 'BB->B': numbers.int_and_impl, - 'hh->h': numbers.int_and_impl, - 'HH->H': numbers.int_and_impl, - 'ii->i': numbers.int_and_impl, - 'II->I': numbers.int_and_impl, - 'll->l': numbers.int_and_impl, - 'LL->L': numbers.int_and_impl, - 'qq->q': numbers.int_and_impl, - 'QQ->Q': numbers.int_and_impl, + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_and_impl, + "BB->B": numbers.int_and_impl, + "hh->h": numbers.int_and_impl, + "HH->H": numbers.int_and_impl, + "ii->i": numbers.int_and_impl, + "II->I": numbers.int_and_impl, + "ll->l": numbers.int_and_impl, + "LL->L": numbers.int_and_impl, + "qq->q": numbers.int_and_impl, + "QQ->Q": numbers.int_and_impl, } db[np.bitwise_or] = { - '??->?': numbers.int_or_impl, - 'bb->b': numbers.int_or_impl, - 'BB->B': numbers.int_or_impl, - 'hh->h': numbers.int_or_impl, - 'HH->H': numbers.int_or_impl, - 'ii->i': numbers.int_or_impl, - 'II->I': numbers.int_or_impl, - 'll->l': numbers.int_or_impl, - 'LL->L': numbers.int_or_impl, - 'qq->q': 
numbers.int_or_impl, - 'QQ->Q': numbers.int_or_impl, + "??->?": numbers.int_or_impl, + "bb->b": numbers.int_or_impl, + "BB->B": numbers.int_or_impl, + "hh->h": numbers.int_or_impl, + "HH->H": numbers.int_or_impl, + "ii->i": numbers.int_or_impl, + "II->I": numbers.int_or_impl, + "ll->l": numbers.int_or_impl, + "LL->L": numbers.int_or_impl, + "qq->q": numbers.int_or_impl, + "QQ->Q": numbers.int_or_impl, } db[np.bitwise_xor] = { - '??->?': numbers.int_xor_impl, - 'bb->b': numbers.int_xor_impl, - 'BB->B': numbers.int_xor_impl, - 'hh->h': numbers.int_xor_impl, - 'HH->H': numbers.int_xor_impl, - 'ii->i': numbers.int_xor_impl, - 'II->I': numbers.int_xor_impl, - 'll->l': numbers.int_xor_impl, - 'LL->L': numbers.int_xor_impl, - 'qq->q': numbers.int_xor_impl, - 'QQ->Q': numbers.int_xor_impl, + "??->?": numbers.int_xor_impl, + "bb->b": numbers.int_xor_impl, + "BB->B": numbers.int_xor_impl, + "hh->h": numbers.int_xor_impl, + "HH->H": numbers.int_xor_impl, + "ii->i": numbers.int_xor_impl, + "II->I": numbers.int_xor_impl, + "ll->l": numbers.int_xor_impl, + "LL->L": numbers.int_xor_impl, + "qq->q": numbers.int_xor_impl, + "QQ->Q": numbers.int_xor_impl, } db[np.invert] = { - '?->?': numbers.int_invert_impl, - 'b->b': numbers.int_invert_impl, - 'B->B': numbers.int_invert_impl, - 'h->h': numbers.int_invert_impl, - 'H->H': numbers.int_invert_impl, - 'i->i': numbers.int_invert_impl, - 'I->I': numbers.int_invert_impl, - 'l->l': numbers.int_invert_impl, - 'L->L': numbers.int_invert_impl, - 'q->q': numbers.int_invert_impl, - 'Q->Q': numbers.int_invert_impl, + "?->?": numbers.int_invert_impl, + "b->b": numbers.int_invert_impl, + "B->B": numbers.int_invert_impl, + "h->h": numbers.int_invert_impl, + "H->H": numbers.int_invert_impl, + "i->i": numbers.int_invert_impl, + "I->I": numbers.int_invert_impl, + "l->l": numbers.int_invert_impl, + "L->L": numbers.int_invert_impl, + "q->q": numbers.int_invert_impl, + "Q->Q": numbers.int_invert_impl, } db[np.left_shift] = { - 'bb->b': 
numbers.int_shl_impl, - 'BB->B': numbers.int_shl_impl, - 'hh->h': numbers.int_shl_impl, - 'HH->H': numbers.int_shl_impl, - 'ii->i': numbers.int_shl_impl, - 'II->I': numbers.int_shl_impl, - 'll->l': numbers.int_shl_impl, - 'LL->L': numbers.int_shl_impl, - 'qq->q': numbers.int_shl_impl, - 'QQ->Q': numbers.int_shl_impl, + "bb->b": numbers.int_shl_impl, + "BB->B": numbers.int_shl_impl, + "hh->h": numbers.int_shl_impl, + "HH->H": numbers.int_shl_impl, + "ii->i": numbers.int_shl_impl, + "II->I": numbers.int_shl_impl, + "ll->l": numbers.int_shl_impl, + "LL->L": numbers.int_shl_impl, + "qq->q": numbers.int_shl_impl, + "QQ->Q": numbers.int_shl_impl, } db[np.right_shift] = { - 'bb->b': numbers.int_shr_impl, - 'BB->B': numbers.int_shr_impl, - 'hh->h': numbers.int_shr_impl, - 'HH->H': numbers.int_shr_impl, - 'ii->i': numbers.int_shr_impl, - 'II->I': numbers.int_shr_impl, - 'll->l': numbers.int_shr_impl, - 'LL->L': numbers.int_shr_impl, - 'qq->q': numbers.int_shr_impl, - 'QQ->Q': numbers.int_shr_impl, + "bb->b": numbers.int_shr_impl, + "BB->B": numbers.int_shr_impl, + "hh->h": numbers.int_shr_impl, + "HH->H": numbers.int_shr_impl, + "ii->i": numbers.int_shr_impl, + "II->I": numbers.int_shr_impl, + "ll->l": numbers.int_shr_impl, + "LL->L": numbers.int_shr_impl, + "qq->q": numbers.int_shr_impl, + "QQ->Q": numbers.int_shr_impl, } db[np.log] = { - 'f->f': np_real_log_impl, - 'd->d': np_real_log_impl, - 'F->F': npyfuncs.np_complex_log_impl, - 'D->D': npyfuncs.np_complex_log_impl, + "f->f": np_real_log_impl, + "d->d": np_real_log_impl, + "F->F": npyfuncs.np_complex_log_impl, + "D->D": npyfuncs.np_complex_log_impl, } db[np.log2] = { - 'f->f': np_real_log2_impl, - 'd->d': np_real_log2_impl, - 'F->F': npyfuncs.np_complex_log2_impl, - 'D->D': npyfuncs.np_complex_log2_impl, + "f->f": np_real_log2_impl, + "d->d": np_real_log2_impl, + "F->F": npyfuncs.np_complex_log2_impl, + "D->D": npyfuncs.np_complex_log2_impl, } db[np.log10] = { - 'f->f': np_real_log10_impl, - 'd->d': np_real_log10_impl, 
- 'F->F': npyfuncs.np_complex_log10_impl, - 'D->D': npyfuncs.np_complex_log10_impl, + "f->f": np_real_log10_impl, + "d->d": np_real_log10_impl, + "F->F": npyfuncs.np_complex_log10_impl, + "D->D": npyfuncs.np_complex_log10_impl, } return db diff --git a/numba_cuda/numba/cuda/utils.py b/numba_cuda/numba/cuda/utils.py index 48ce2b011..a66989135 100644 --- a/numba_cuda/numba/cuda/utils.py +++ b/numba_cuda/numba/cuda/utils.py @@ -9,7 +9,7 @@ def _readenv(name, ctor, default): return default() if callable(default) else default try: if ctor is bool: - return value.lower() in {'1', "true"} + return value.lower() in {"1", "true"} return ctor(value) except Exception: warnings.warn( @@ -17,6 +17,6 @@ def _readenv(name, ctor, default): f"value '{value}' could not be parsed.\n" "The parse failed with exception:\n" f"{traceback.format_exc()}", - RuntimeWarning + RuntimeWarning, ) return default diff --git a/numba_cuda/numba/cuda/vector_types.py b/numba_cuda/numba/cuda/vector_types.py index 5174e2b20..147c21aee 100644 --- a/numba_cuda/numba/cuda/vector_types.py +++ b/numba_cuda/numba/cuda/vector_types.py @@ -50,7 +50,7 @@ def make_vector_type( name: str, base_type: types.Type, attr_names: Tuple[str, ...], - user_facing_object + user_facing_object, ) -> types.Type: """Create a vector type. 
@@ -149,7 +149,7 @@ def lowering(context, builder, sig, actual_args): lower(ctor, *arglist)(lowering) -vector_types : Dict[str, VectorType] = {} +vector_types: Dict[str, VectorType] = {} def build_constructor_overloads(base_type, vty_name, num_elements, arglists, l): diff --git a/numba_cuda/numba/cuda/vectorizers.py b/numba_cuda/numba/cuda/vectorizers.py index b4c6bcf5d..4cd80edbf 100644 --- a/numba_cuda/numba/cuda/vectorizers.py +++ b/numba_cuda/numba/cuda/vectorizers.py @@ -1,8 +1,11 @@ from numba import cuda from numpy import array as np_array from numba.cuda import deviceufunc -from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc, - GUFuncCallSteps) +from numba.cuda.deviceufunc import ( + UFuncMechanism, + GeneralizedUFunc, + GUFuncCallSteps, +) class CUDAUFuncDispatcher(object): @@ -28,8 +31,9 @@ def __call__(self, *args, **kws): return CUDAUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): - assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ - "ufunc" + assert len(list(self.functions.keys())[0]) == 2, ( + "must be a binary ufunc" + ) assert arg.ndim == 1, "must use 1d array" n = arg.shape[0] @@ -82,12 +86,12 @@ def __reduce(self, mem, gpu_mems, stream): class _CUDAGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ - '_stream', + "_stream", ] def __init__(self, nin, nout, args, kwargs): super().__init__(nin, nout, args, kwargs) - self._stream = kwargs.get('stream', 0) + self._stream = kwargs.get("stream", 0) def is_device_array(self, obj): return cuda.is_cuda_array(obj) @@ -126,25 +130,27 @@ def _call_steps(self): return _CUDAGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): - return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape, - strides=(0,), - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=shape, strides=(0,), dtype=ary.dtype, gpu_data=ary.gpu_data + ) def _broadcast_add_axis(self, ary, newshape): newax = len(newshape) - 
len(ary.shape) # Add 0 strides for missing dimension newstrides = (0,) * newax + ary.strides - return cuda.cudadrv.devicearray.DeviceNDArray(shape=newshape, - strides=newstrides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=newshape, + strides=newstrides, + dtype=ary.dtype, + gpu_data=ary.gpu_data, + ) class CUDAUFuncMechanism(UFuncMechanism): """ Provide CUDA specialization """ + DEFAULT_STREAM = 0 def launch(self, func, count, stream, args): @@ -173,9 +179,11 @@ def allocate_device_array(self, shape, dtype, stream): return cuda.device_array(shape=shape, dtype=dtype, stream=stream) def broadcast_device(self, ary, shape): - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -183,18 +191,17 @@ def broadcast_device(self, ary, shape): for ax in ax_differs: strides[ax] = 0 - return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape, - strides=strides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) -vectorizer_stager_source = ''' +vectorizer_stager_source = """ def __vectorized_{name}({args}, __out__): __tid__ = __cuda__.grid(1) if __tid__ < __out__.shape[0]: __out__[__tid__] = __core__({argitems}) -''' +""" class CUDAVectorize(deviceufunc.DeviceVectorize): @@ -204,8 +211,7 @@ def _compile_core(self, sig): def _get_globals(self, corefn): glbl = self.pyfunc.__globals__.copy() - glbl.update({'__cuda__': cuda, - '__core__': corefn}) + glbl.update({"__cuda__": cuda, "__core__": corefn}) return glbl def _compile_kernel(self, fnobj, sig): @@ -222,20 +228,20 @@ def _kernel_template(self): # 
------------------------------------------------------------------------------ # Generalized CUDA ufuncs -_gufunc_stager_source = ''' +_gufunc_stager_source = """ def __gufunc_{name}({args}): __tid__ = __cuda__.grid(1) if __tid__ < {checkedarg}: __core__({argitems}) -''' +""" class CUDAGUFuncVectorize(deviceufunc.DeviceGUFuncVectorize): def build_ufunc(self): engine = deviceufunc.GUFuncEngine(self.inputsig, self.outputsig) - return CUDAGeneralizedUFunc(kernelmap=self.kernelmap, - engine=engine, - pyfunc=self.pyfunc) + return CUDAGeneralizedUFunc( + kernelmap=self.kernelmap, engine=engine, pyfunc=self.pyfunc + ) def _compile_kernel(self, fnobj, sig): return cuda.jit(sig)(fnobj) @@ -247,6 +253,5 @@ def _kernel_template(self): def _get_globals(self, sig): corefn = cuda.jit(sig, device=True)(self.pyfunc) glbls = self.py_func.__globals__.copy() - glbls.update({'__cuda__': cuda, - '__core__': corefn}) + glbls.update({"__cuda__": cuda, "__core__": corefn}) return glbls diff --git a/pyproject.toml b/pyproject.toml index 2a484d9da..6dbf04e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,3 +37,66 @@ include = ["numba_cuda*"] [tool.setuptools.package-data] "*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "*.cuh", "VERSION", "Makefile"] + +[tool.ruff] +line-length = 80 + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 80 + +[tool.ruff.lint.pycodestyle] +max-doc-length = 80 +max-line-length = 80 + +[tool.ruff.lint] +ignore = [ + # Extra space in brackets + "E20", + # Multiple spaces around "," + "E231", + "E241", + # Comments + "E26", + # Assigning lambda expression + "E731", + # Ambiguous variable names + "E741", +] +fixable = ["ALL"] + +exclude = [ + "__pycache__", + ".git", + "*.pyc", + "*~", + "*.o", + "*.so", + "*.cpp", + "*.c", + "*.h", +] + +[tool.ruff.lint.per-file-ignores] +# Slightly long line in the standard version file +"numba_cuda/_version.py" = ["E501"] +# "Unused" imports / potentially undefined names in init files 
+"numba_cuda/numba/cuda/__init__.py" = ["F401", "F403", "F405"] +"numba_cuda/numba/cuda/simulator/__init__.py" = ["F401", "F403"] +"numba_cuda/numba/cuda/simulator/cudadrv/__init__.py" = ["F401"] +# Ignore star imports, unused imports, and "may be defined by star imports" +# errors in device_init because its purpose is to bring together a lot of +# the public API to be star-imported in numba.cuda.__init__ +"numba_cuda/numba/cuda/device_init.py" = ["F401", "F403", "F405"] +# libdevice.py is an autogenerated file containing stubs for all the device +# functions. Some of the lines in docstrings are a little over-long, as they +# contain the URLs of the reference pages in the online libdevice +# documentation. +"numba_cuda/numba/cuda/libdevice.py" = ["E501"] +# Ignore too-long lines in the doc examples, prioritising readability +# in the docs over line length in the example source (especially given that +# the test code is already indented by 8 spaces) +"numba_cuda/numba/cuda/tests/doc_examples/test_random.py" = ["E501"] +"numba_cuda/numba/cuda/tests/doc_examples/test_cg.py" = ["E501"] +"numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py" = ["E501"] +"numba_cuda/numba/tests/doc_examples/test_interval_example.py" = ["E501"] diff --git a/setup.py b/setup.py index 98a1061d2..bfb11f27a 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,12 @@ def run(self): def get_source_files(self): src = super().get_source_files() - src.extend([ - str(SITE_PACKAGES / REDIRECTOR_PTH), - str(SITE_PACKAGES / REDIRECTOR_PY), - ]) + src.extend( + [ + str(SITE_PACKAGES / REDIRECTOR_PTH), + str(SITE_PACKAGES / REDIRECTOR_PY), + ] + ) return src def get_output_mapping(self): @@ -60,11 +62,17 @@ def _select_strategy(self, name, tag, build_lib): # the repo. It could be implemented, but we only handle the default # case for now. if self.mode is not None and self.mode != "lenient": - raise RuntimeError("Only lenient mode is supported for editable " - f"install. 
Current mode is {self.mode}") + raise RuntimeError( + "Only lenient mode is supported for editable " + f"install. Current mode is {self.mode}" + ) return TopLevelFinderWithRedirector(self.distribution, name) -setup(cmdclass={"build_py": build_py_with_redirector, - "editable_wheel": editable_wheel_with_redirector}) +setup( + cmdclass={ + "build_py": build_py_with_redirector, + "editable_wheel": editable_wheel_with_redirector, + } +) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index ae9043307..1c76609ac 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -4,11 +4,14 @@ import sys import warnings -multiple_locations_msg = ("Multiple submodule search locations for {}. " - "Cannot redirect numba.cuda to numba_cuda") +multiple_locations_msg = ( + "Multiple submodule search locations for {}. " + "Cannot redirect numba.cuda to numba_cuda" +) -no_spec_msg = ("Couldn't get spec for {}. " - "Cannot redirect numba.cuda to numba_cuda") +no_spec_msg = ( + "Couldn't get spec for {}. 
Cannot redirect numba.cuda to numba_cuda" +) class NumbaCudaFinder(importlib.abc.MetaPathFinder): @@ -19,17 +22,17 @@ def ensure_initialized(self): if self.initialized is not None: return self.initialized - numba_spec = importlib.util.find_spec('numba') + numba_spec = importlib.util.find_spec("numba") if numba_spec is None: - warnings.warn(no_spec_msg.format('numba')) + warnings.warn(no_spec_msg.format("numba")) self.initialized = False return False - numba_cuda_spec = importlib.util.find_spec('numba_cuda') + numba_cuda_spec = importlib.util.find_spec("numba_cuda") if numba_spec is None: - warnings.warn(no_spec_msg.format('numba_cuda')) + warnings.warn(no_spec_msg.format("numba_cuda")) self.initialized = False return False @@ -37,19 +40,19 @@ def ensure_initialized(self): numba_cuda_search_locations = numba_cuda_spec.submodule_search_locations if len(numba_search_locations) != 1: - warnings.warn(multiple_locations_msg.format('numba')) + warnings.warn(multiple_locations_msg.format("numba")) self.initialized = False return False if len(numba_cuda_search_locations) != 1: - warnings.warn(multiple_locations_msg.format('numba_cuda')) + warnings.warn(multiple_locations_msg.format("numba_cuda")) self.initialized = False return False self.numba_path = numba_search_locations[0] location = numba_cuda_search_locations[0] - self.numba_cuda_path = str((pathlib.Path(location) / 'numba')) + self.numba_cuda_path = str((pathlib.Path(location) / "numba")) self.initialized = True return True @@ -64,8 +67,9 @@ def find_spec(self, name, path, target=None): # Re-entrancy - return and carry on return None - oot_path = [p.replace(self.numba_path, self.numba_cuda_path) - for p in path] + oot_path = [ + p.replace(self.numba_path, self.numba_cuda_path) for p in path + ] for finder in sys.meta_path: try: spec = finder.find_spec(name, oot_path, target)