diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 591e4a2af..000000000 --- a/.flake8 +++ /dev/null @@ -1,52 +0,0 @@ -[flake8] -ignore = - # Extra space in brackets - E20, - # Multiple spaces around "," - E231,E241, - # Comments - E26, - # Assigning lambda expression - E731, - # Ambiguous variable names - E741, - # line break before binary operator - W503, - # line break after binary operator - W504, -max-line-length = 80 - -exclude = - __pycache__ - .git - *.pyc - *~ - *.o - *.so - *.cpp - *.c - *.h - -per-file-ignores = - # Slightly long line in the standard version file - numba_cuda/_version.py: E501 - # "Unused" imports / potentially undefined names in init files - numba_cuda/numba/cuda/__init__.py:F401,F403,F405 - numba_cuda/numba/cuda/simulator/__init__.py:F401,F403 - numba_cuda/numba/cuda/simulator/cudadrv/__init__.py:F401 - # Ignore star imports, unused imports, and "may be defined by star imports" - # errors in device_init because its purpose is to bring together a lot of - # the public API to be star-imported in numba.cuda.__init__ - numba_cuda/numba/cuda/device_init.py:F401,F403,F405 - # libdevice.py is an autogenerated file containing stubs for all the device - # functions. Some of the lines in docstrings are a little over-long, as they - # contain the URLs of the reference pages in the online libdevice - # documentation. 
- numba_cuda/numba/cuda/libdevice.py:E501 - # Ignore too-long lines in the doc examples, prioritising readability - # in the docs over line length in the example source (especially given that - # the test code is already indented by 8 spaces) - numba_cuda/numba/cuda/tests/doc_examples/test_random.py:E501 - numba_cuda/numba/cuda/tests/doc_examples/test_cg.py:E501 - numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py:E501 - numba_cuda/numba/tests/doc_examples/test_interval_example.py:E501 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..e1edafb25 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Migrate code style to ruff +06b62024f77bb92b585315fe61b9ba15e0885d71 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0a114cd32..478cd1ef5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,23 @@ repos: -- repo: https://github.com/PyCQA/flake8 - rev: 7.1.0 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 # Use the latest version or a specific tag hooks: - - id: flake8 + - id: check-added-large-files + - id: check-ast + - id: check-json + - id: check-merge-conflict + - id: check-toml + - id: check-yaml + exclude: ^conda/recipes/numba-cuda/meta.yaml + - id: debug-statements + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: trailing-whitespace + - id: mixed-line-ending + args: ['--fix=lf'] +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.2 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/docs/make.bat b/docs/make.bat index 9c55982a2..3629950d6 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,39 +1,39 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -if "%SPHINXOPTS%" == "" ( - set SPHINXOPTS=-W -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +if "%SPHINXOPTS%" == "" ( + set SPHINXOPTS=-W +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py index dae885740..23c78665b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,39 +6,40 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'Numba CUDA' -copyright = '2012-2024 Anaconda Inc. 2024, NVIDIA Corporation.' -author = 'NVIDIA Corporation' +project = "Numba CUDA" +copyright = "2012-2024 Anaconda Inc. 2024, NVIDIA Corporation." 
+author = "NVIDIA Corporation" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ['numpydoc', 'sphinx.ext.intersphinx', 'sphinx.ext.autodoc'] +extensions = ["numpydoc", "sphinx.ext.intersphinx", "sphinx.ext.autodoc"] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None), - 'numpy': ('https://numpy.org/doc/stable/', None), - 'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None), - 'numba': ('https://numba.readthedocs.io/en/latest/', None), + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "llvmlite": ("https://llvmlite.readthedocs.io/en/latest/", None), + "numba": ("https://numba.readthedocs.io/en/latest/", None), } # To prevent autosummary warnings numpydoc_show_class_members = False -autodoc_typehints = 'none' +autodoc_typehints = "none" # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output try: import nvidia_sphinx_theme # noqa: F401 + html_theme = "nvidia_sphinx_theme" except ImportError: html_theme = "sphinx_rtd_theme" -html_static_path = ['_static'] +html_static_path = ["_static"] html_favicon = "_static/numba-green-icon-rgb.svg" html_show_sphinx = False diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 31197241e..7ed079f1c 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -19,7 +19,7 @@ this is the recommended way to instantiate vector types. For convenience, users adapting existing kernels from CUDA C/C++ to Python may use aliases consistent with the C/C++ namings. 
For example, ``float3`` aliases ``float32x3``, -``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc. +``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc. Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly with their constructor. For example, to construct a ``float32x3``: @@ -44,7 +44,7 @@ vector type. For example, all of the following constructions are valid: # Construct a 4-component vector with 2 2-component vectors u4 = uint32x4(u2, u2) -The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields +The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields ``x``, ``y``, ``z``, and ``w`` respectively. The components are immutable after construction in the present version of Numba; it is expected that support for mutating vector components will be added in a future release. diff --git a/docs/source/user/cooperative_groups.rst b/docs/source/user/cooperative_groups.rst index a08fa3784..0ce70614d 100644 --- a/docs/source/user/cooperative_groups.rst +++ b/docs/source/user/cooperative_groups.rst @@ -50,7 +50,7 @@ overloads: This can be used to ensure that the kernel is launched with no more than the maximum number of blocks. Exceeding the maximum number of blocks for the cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`` -error. +error. Applications and Example diff --git a/docs/source/user/device-management.rst b/docs/source/user/device-management.rst index 8f9beb4db..12878961d 100644 --- a/docs/source/user/device-management.rst +++ b/docs/source/user/device-management.rst @@ -89,4 +89,3 @@ For example, to obtain the UUID of the current device: dev = cuda.current_context().device # prints e.g. 
"GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643" print(dev.uuid) - diff --git a/docs/source/user/examples.rst b/docs/source/user/examples.rst index 8adcf313b..5425f4bb1 100644 --- a/docs/source/user/examples.rst +++ b/docs/source/user/examples.rst @@ -101,7 +101,7 @@ propagates through an object over time. It works by discretizing the problem in 1. The domain is partitioned into a mesh of points that each have an individual temperature. 2. Time is partitioned into discrete intervals that are advanced forward sequentially. -Then, the following assumption is applied: The temperature of a point after some interval +Then, the following assumption is applied: The temperature of a point after some interval has passed is some weighted average of the temperature of the points that are directly adjacent to it. Intuitively, if all the points in the domain are very hot and a single point in the middle is very cold, as time passes, the hot points will cause @@ -109,9 +109,9 @@ the cold one to heat up and the cold point will cause the surrounding hot pieces slightly. Simply put, the heat spreads throughout the object. We can implement this simulation using a Numba kernel. Let's start simple by assuming -we have a one dimensional object which we'll represent with an array of values. The position +we have a one dimensional object which we'll represent with an array of values. The position of the element in the array is the position of a point within the object, and the value -of the element represents the temperature. +of the element represents the temperature. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py :language: python @@ -138,7 +138,7 @@ The initial state of the problem can be visualized as: In our kernel each thread will be responsible for managing the temperature update for a single element in a loop over the desired number of timesteps. The kernel is below. 
Note the use of cooperative group -synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See +synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See :func:`numba.cuda.cg.this_grid() ` for details. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py @@ -237,15 +237,15 @@ A common problem in business analytics is that of grouping the activity of users sessions, called "sessionization". The idea is that users generally traverse through a website and perform various actions (clicking something, filling out a form, etc.) in discrete groups. Perhaps a customer spends some time shopping for an item in the morning and then again at night - often the business is interested in -treating these periods as separate interactions with their service, and this creates the problem of +treating these periods as separate interactions with their service, and this creates the problem of programmatically splitting up activity in some agreed-upon way. -Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data -containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let -``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's +Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data +containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let +``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's only one type of action, so all there is to know is when it happened. -Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique +Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique session. 
We'll define the boundary between sessions as when there has been at least one hour between clicks. @@ -256,7 +256,7 @@ session. We'll define the boundary between sessions as when there has been at le :end-before: ex_sessionize.import.end :dedent: 8 :linenos: - + Here is a solution using Numba: .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py @@ -285,8 +285,8 @@ and a similar pattern is seen throughout. JIT Function CPU-GPU Compatibility ================================== -This example demonstrates how ``numba.jit`` can be used to jit compile a function for the CPU, while at the same time making -it available for use inside CUDA kernels. This can be very useful for users that are migrating workflows from CPU to GPU as +This example demonstrates how ``numba.jit`` can be used to jit compile a function for the CPU, while at the same time making +it available for use inside CUDA kernels. This can be very useful for users that are migrating workflows from CPU to GPU as they can directly reuse potential business logic with fewer code changes. Take the following example function: @@ -309,7 +309,7 @@ The function ``business_logic`` can be run standalone in compiled form on the CP :dedent: 8 :linenos: -It can also be directly reused threadwise inside a GPU kernel. For example one may +It can also be directly reused threadwise inside a GPU kernel. For example one may generate some vectors to represent ``x``, ``y``, and ``z``: .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py @@ -345,12 +345,12 @@ This kernel can be invoked in the normal way: Monte Carlo Integration ======================= -This example shows how to use Numba to approximate the value of a definite integral by rapidly generating +This example shows how to use Numba to approximate the value of a definite integral by rapidly generating random numbers on the GPU. 
A detailed description of the mathematical mechanics of Monte Carlo integration -is out of the scope of the example, but it can briefly be described as an averaging process where the area +is out of the scope of the example, but it can briefly be described as an averaging process where the area under the curve is approximated by taking the average of many rectangles formed by its function values. -In addition, this example shows how to perform reductions in numba using the +In addition, this example shows how to perform reductions in numba using the :func:`cuda.reduce() ` API. .. literalinclude:: ../../../numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py diff --git a/docs/source/user/external-memory.rst b/docs/source/user/external-memory.rst index 28a8f59f0..a13071f80 100644 --- a/docs/source/user/external-memory.rst +++ b/docs/source/user/external-memory.rst @@ -52,7 +52,7 @@ sections, using the :func:`~numba.cuda.defer_cleanup` context manager. When an EMM Plugin is in use, the deallocation strategy is implemented by the EMM, and Numba's internal deallocation mechanism is not used. The EMM Plugin could implement: - + - A similar strategy to the Numba deallocation behaviour, or - Something more appropriate to the plugin - for example, deallocated memory might immediately be returned to a memory pool. 
diff --git a/docs/source/user/intrinsics.rst b/docs/source/user/intrinsics.rst index 521c1d918..4dc342b89 100644 --- a/docs/source/user/intrinsics.rst +++ b/docs/source/user/intrinsics.rst @@ -54,5 +54,3 @@ Multiple dimension arrays are supported by using a tuple of ints for the index:: result = np.zeros((3, 3, 3), dtype=np.float64) max_example_3d[(2, 2, 2), (5, 5, 5)](result, arr) print(result[0, 1, 2], '==', np.max(arr)) - - diff --git a/docs/source/user/laplace_final.svg b/docs/source/user/laplace_final.svg index 4f3b197fb..1f88cc93e 100644 --- a/docs/source/user/laplace_final.svg +++ b/docs/source/user/laplace_final.svg @@ -21,19 +21,19 @@ - - @@ -41,8 +41,8 @@ z - @@ -53,25 +53,25 @@ L 0 3.5 - @@ -89,28 +89,28 @@ z - @@ -130,23 +130,23 @@ z - @@ -166,34 +166,34 @@ z - @@ -213,43 +213,43 @@ z - @@ -269,18 +269,18 @@ z - @@ -295,130 +295,130 @@ z - - - - - - @@ -437,8 +437,8 @@ z - @@ -490,36 +490,36 @@ L -3.5 0 - @@ -552,29 +552,29 @@ z - @@ -587,168 +587,168 @@ z - - - - - - - @@ -767,125 +767,125 @@ z - - @@ -1894,23 +1894,23 @@ z - - - - @@ -1918,17 +1918,17 @@ L 684.288 51.3216 - diff --git a/docs/source/user/laplace_initial.svg b/docs/source/user/laplace_initial.svg index dbede3687..204626f84 100644 --- a/docs/source/user/laplace_initial.svg +++ b/docs/source/user/laplace_initial.svg @@ -21,19 +21,19 @@ - - @@ -41,8 +41,8 @@ z - @@ -53,25 +53,25 @@ L 0 3.5 - @@ -89,28 +89,28 @@ z - @@ -130,23 +130,23 @@ z - @@ -166,34 +166,34 @@ z - @@ -213,43 +213,43 @@ z - @@ -269,18 +269,18 @@ z - @@ -295,130 +295,130 @@ z - - - - - - @@ -437,8 +437,8 @@ z - @@ -537,168 +537,168 @@ L -3.5 0 - - - - - - - @@ -717,24 +717,24 @@ z - - @@ -1743,73 +1743,73 @@ z - - - - - - - diff --git a/docs/source/user/memory.rst b/docs/source/user/memory.rst index 116531e83..c876a8bf6 100644 --- a/docs/source/user/memory.rst +++ b/docs/source/user/memory.rst @@ -126,10 +126,10 @@ traditional dynamic memory management. device function). 
*shape* is either an integer or a tuple of integers representing the array's dimensions and must be a simple constant expression. A "simple constant expression" includes, but is not limited to: - + #. A literal (e.g. ``10``) #. A local variable whose right-hand side is a literal or a simple constant - expression (e.g. ``shape``, where ``shape`` is defined earlier in the function + expression (e.g. ``shape``, where ``shape`` is defined earlier in the function as ``shape = 10``) #. A global variable that is defined in the jitted function's globals by the time of compilation (e.g. ``shape``, where ``shape`` is defined using any expression @@ -259,14 +259,14 @@ unlike traditional dynamic memory management. Allocate a local array of the given *shape* and *type* on the device. *shape* is either an integer or a tuple of integers representing the array's - dimensions and must be a simple constant expression. A "simple constant expression" + dimensions and must be a simple constant expression. A "simple constant expression" includes, but is not limited to: #. A literal (e.g. ``10``) #. A local variable whose right-hand side is a literal or a simple constant expression (e.g. ``shape``, where ``shape`` is defined earlier in the function as ``shape = 10``) - #. A global variable that is defined in the jitted function's globals by the time + #. A global variable that is defined in the jitted function's globals by the time of compilation (e.g. ``shape``, where ``shape`` is defined using any expression at global scope). diff --git a/docs/source/user/simulator.rst b/docs/source/user/simulator.rst index 099ffc347..b10a0e874 100644 --- a/docs/source/user/simulator.rst +++ b/docs/source/user/simulator.rst @@ -11,7 +11,7 @@ be used to debug CUDA Python code, either by adding print statements to your code, or by using the debugger to step through the execution of an individual thread. 
-The simulator deliberately allows running non-CUDA code like starting a debugger +The simulator deliberately allows running non-CUDA code like starting a debugger and printing arbitrary expressions for debugging purposes. Therefore, it is best to start from code that compiles for the CUDA target, and then move over to the simulator to investigate issues. @@ -24,7 +24,7 @@ Using the simulator =================== The simulator is enabled by setting the environment variable -:envvar:`NUMBA_ENABLE_CUDASIM` to 1 prior to importing Numba. CUDA Python code +:envvar:`NUMBA_ENABLE_CUDASIM` to 1 prior to importing Numba. CUDA Python code may then be executed as normal. The easiest way to use the debugger inside a kernel is to only stop a single thread, otherwise the interaction with the debugger is difficult to handle. For example, the kernel below will stop in @@ -93,8 +93,8 @@ Some limitations of the simulator include: structured array access by attribute that works with the hardware target may fail in the simulator - see :ref:`structured-array-access`. * Operations directly against device arrays are only partially supported, that - is, testing equality, less than, greater than, and basic mathematical - operations are supported, but many other operations, such as the in-place + is, testing equality, less than, greater than, and basic mathematical + operations are supported, but many other operations, such as the in-place operators and bit operators are not. * The :func:`ffs() ` function only works correctly for values that can be represented using 32-bit integers. diff --git a/docs/source/user/ufunc.rst b/docs/source/user/ufunc.rst index 06f85e7ca..6beb5baab 100644 --- a/docs/source/user/ufunc.rst +++ b/docs/source/user/ufunc.rst @@ -64,7 +64,7 @@ the CUDA ufunc functionality. 
This may be accomplished as follows:: from numba import guvectorize - @guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], + @guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], '(m,n),(n,p)->(m,p)', target='cuda') def matmulcore(A, B, C): ... diff --git a/numba_cuda/_version.py b/numba_cuda/_version.py index 1cd1c11d6..01fe47f9f 100644 --- a/numba_cuda/_version.py +++ b/numba_cuda/_version.py @@ -15,5 +15,8 @@ import importlib.resources __version__ = ( - importlib.resources.files("numba_cuda").joinpath("VERSION").read_text().strip() + importlib.resources.files("numba_cuda") + .joinpath("VERSION") + .read_text() + .strip() ) diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py index 01d468155..639d4d469 100644 --- a/numba_cuda/numba/cuda/__init__.py +++ b/numba_cuda/numba/cuda/__init__.py @@ -7,8 +7,12 @@ from .device_init import * from .device_init import _auto_device -from numba.cuda.compiler import (compile, compile_for_current_device, - compile_ptx, compile_ptx_for_current_device) +from numba.cuda.compiler import ( + compile, + compile_for_current_device, + compile_ptx, + compile_ptx_for_current_device, +) # This is the out-of-tree NVIDIA-maintained target. This is reported in Numba # sysinfo (`numba -s`): diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 5dfe7c434..9a2300a35 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -2,7 +2,6 @@ API that are reported to numba.cuda """ - import contextlib import os @@ -28,35 +27,37 @@ def from_cuda_array_interface(desc, owner=None, sync=True): If ``sync`` is ``True``, then the imported stream (if present) will be synchronized. 
""" - version = desc.get('version') + version = desc.get("version") # Mask introduced in version 1 if 1 <= version: - mask = desc.get('mask') + mask = desc.get("mask") # Would ideally be better to detect if the mask is all valid if mask is not None: - raise NotImplementedError('Masked arrays are not supported') + raise NotImplementedError("Masked arrays are not supported") - shape = desc['shape'] - strides = desc.get('strides') - dtype = np.dtype(desc['typestr']) + shape = desc["shape"] + strides = desc.get("strides") + dtype = np.dtype(desc["typestr"]) shape, strides, dtype = prepare_shape_strides_dtype( - shape, strides, dtype, order='C') + shape, strides, dtype, order="C" + ) size = driver.memory_size_from_info(shape, strides, dtype.itemsize) - devptr = driver.get_devptr_for_active_ctx(desc['data'][0]) + devptr = driver.get_devptr_for_active_ctx(desc["data"][0]) data = driver.MemoryPointer( - current_context(), devptr, size=size, owner=owner) - stream_ptr = desc.get('stream', None) + current_context(), devptr, size=size, owner=owner + ) + stream_ptr = desc.get("stream", None) if stream_ptr is not None: stream = external_stream(stream_ptr) if sync and config.CUDA_ARRAY_INTERFACE_SYNC: stream.synchronize() else: - stream = 0 # No "Numba default stream", not the CUDA default stream - da = devicearray.DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=data, - stream=stream) + stream = 0 # No "Numba default stream", not the CUDA default stream + da = devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, gpu_data=data, stream=stream + ) return da @@ -73,8 +74,9 @@ def as_cuda_array(obj, sync=True): if not is_cuda_array(obj): raise TypeError("*obj* doesn't implement the cuda array interface.") else: - return from_cuda_array_interface(obj.__cuda_array_interface__, - owner=obj, sync=sync) + return from_cuda_array_interface( + obj.__cuda_array_interface__, owner=obj, sync=sync + ) def is_cuda_array(obj): @@ -82,7 +84,7 @@ def 
is_cuda_array(obj): Does not verify the validity of the interface. """ - return hasattr(obj, '__cuda_array_interface__') + return hasattr(obj, "__cuda_array_interface__") def is_float16_supported(): @@ -125,8 +127,9 @@ def to_device(obj, stream=0, copy=True, to=None): hary = d_ary.copy_to_host(stream=stream) """ if to is None: - to, new = devicearray.auto_device(obj, stream=stream, copy=copy, - user_explicit=True) + to, new = devicearray.auto_device( + obj, stream=stream, copy=copy, user_explicit=True + ) return to if copy: to.copy_to_device(obj, stream=stream) @@ -134,20 +137,28 @@ def to_device(obj, stream=0, copy=True, to=None): @require_context -def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0): +def device_array(shape, dtype=np.float64, strides=None, order="C", stream=0): """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0) Allocate an empty device ndarray. Similar to :meth:`numpy.empty`. """ - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) - return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype, - stream=stream) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, stream=stream + ) @require_context -def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, - attach_global=True): +def managed_array( + shape, + dtype=np.float64, + strides=None, + order="C", + stream=0, + attach_global=True, +): """managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, attach_global=True) @@ -163,37 +174,48 @@ def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0, *host*, and memory is only accessible by devices with Compute Capability 6.0 and later. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) - buffer = current_context().memallocmanaged(bytesize, - attach_global=attach_global) - npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + buffer = current_context().memallocmanaged( + bytesize, attach_global=attach_global + ) + npary = np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray) managedview.device_setup(buffer, stream=stream) return managedview @require_context -def pinned_array(shape, dtype=np.float64, strides=None, order='C'): +def pinned_array(shape, dtype=np.float64, strides=None, order="C"): """pinned_array(shape, dtype=np.float64, strides=None, order='C') Allocate an :class:`ndarray ` with a buffer that is pinned (pagelocked). Similar to :func:`np.empty() `. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) - bytesize = driver.memory_size_from_info(shape, strides, - dtype.itemsize) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) + bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) buffer = current_context().memhostalloc(bytesize) - return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + return np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) @require_context -def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, - portable=False, wc=False): +def mapped_array( + shape, + dtype=np.float64, + strides=None, + order="C", + stream=0, + portable=False, + wc=False, +): """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, portable=False, wc=False) @@ -206,12 +228,14 @@ def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0, to write by the host and to read by the device, but slower to write by the host and slower to write by the device. 
""" - shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype, - order) + shape, strides, dtype = prepare_shape_strides_dtype( + shape, strides, dtype, order + ) bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize) buffer = current_context().memhostalloc(bytesize, mapped=True) - npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order, - buffer=buffer) + npary = np.ndarray( + shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer + ) mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray) mappedview.device_setup(buffer, stream=stream) return mappedview @@ -243,8 +267,9 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0): driver_handle.reserved[:] = handle # use *IpcHandle* to open the IPC memory ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset) - yield ipchandle.open_array(current_context(), shape=shape, - strides=strides, dtype=dtype) + yield ipchandle.open_array( + current_context(), shape=shape, strides=strides, dtype=dtype + ) ipchandle.close() @@ -260,7 +285,7 @@ def _contiguous_strides_like_array(ary): """ # Don't recompute strides if the default strides will be sufficient to # create a contiguous array. - if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1: + if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1: return None # Otherwise, we need to compute new strides using an algorithm adapted from @@ -270,7 +295,7 @@ def _contiguous_strides_like_array(ary): # Stride permutation. E.g. 
a stride array (4, -2, 12) becomes # [(1, -2), (0, 4), (2, 12)] - strideperm = [ x for x in enumerate(ary.strides) ] + strideperm = [x for x in enumerate(ary.strides)] strideperm.sort(key=lambda x: x[1]) # Compute new strides using permutation @@ -283,10 +308,10 @@ def _contiguous_strides_like_array(ary): def _order_like_array(ary): - if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']: - return 'F' + if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]: + return "F" else: - return 'C' + return "C" def device_array_like(ary, stream=0): @@ -296,8 +321,13 @@ def device_array_like(ary, stream=0): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order, stream=stream) + return device_array( + shape=ary.shape, + dtype=ary.dtype, + strides=strides, + order=order, + stream=stream, + ) def mapped_array_like(ary, stream=0, portable=False, wc=False): @@ -307,8 +337,15 @@ def mapped_array_like(ary, stream=0, portable=False, wc=False): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order, stream=stream, portable=portable, wc=wc) + return mapped_array( + shape=ary.shape, + dtype=ary.dtype, + strides=strides, + order=order, + stream=stream, + portable=portable, + wc=wc, + ) def pinned_array_like(ary): @@ -318,8 +355,9 @@ def pinned_array_like(ary): """ strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return pinned_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) # Stream helper @@ -373,13 +411,15 @@ def external_stream(ptr): @require_context @contextlib.contextmanager def pinned(*arylist): - """A context manager for temporary pinning a sequence of host ndarrays. 
- """ + """A context manager for temporary pinning a sequence of host ndarrays.""" pmlist = [] for ary in arylist: - pm = current_context().mempin(ary, driver.host_pointer(ary), - driver.host_memory_size(ary), - mapped=False) + pm = current_context().mempin( + ary, + driver.host_pointer(ary), + driver.host_memory_size(ary), + mapped=False, + ) pmlist.append(pm) yield @@ -387,16 +427,18 @@ def pinned(*arylist): @require_context @contextlib.contextmanager def mapped(*arylist, **kws): - """A context manager for temporarily mapping a sequence of host ndarrays. - """ - assert not kws or 'stream' in kws, "Only accept 'stream' as keyword." - stream = kws.get('stream', 0) + """A context manager for temporarily mapping a sequence of host ndarrays.""" + assert not kws or "stream" in kws, "Only accept 'stream' as keyword." + stream = kws.get("stream", 0) pmlist = [] devarylist = [] for ary in arylist: - pm = current_context().mempin(ary, driver.host_pointer(ary), - driver.host_memory_size(ary), - mapped=True) + pm = current_context().mempin( + ary, + driver.host_pointer(ary), + driver.host_memory_size(ary), + mapped=True, + ) pmlist.append(pm) devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream) devarylist.append(devary) @@ -427,6 +469,7 @@ def event(timing=True): # Device selection + def select_device(device_id): """ Make the context associated with device *device_id* the current context. @@ -468,7 +511,7 @@ def detect(): Returns a boolean indicating whether any supported devices were detected. 
""" devlist = list_devices() - print('Found %d CUDA devices' % len(devlist)) + print("Found %d CUDA devices" % len(devlist)) supported_count = 0 for dev in devlist: attrs = [] @@ -476,29 +519,29 @@ def detect(): kernel_timeout = dev.KERNEL_EXEC_TIMEOUT tcc = dev.TCC_DRIVER fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO - attrs += [('Compute Capability', '%d.%d' % cc)] - attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)] - attrs += [('PCI Bus ID', dev.PCI_BUS_ID)] - attrs += [('UUID', dev.uuid)] - attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')] + attrs += [("Compute Capability", "%d.%d" % cc)] + attrs += [("PCI Device ID", dev.PCI_DEVICE_ID)] + attrs += [("PCI Bus ID", dev.PCI_BUS_ID)] + attrs += [("UUID", dev.uuid)] + attrs += [("Watchdog", "Enabled" if kernel_timeout else "Disabled")] if os.name == "nt": - attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')] - attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)] + attrs += [("Compute Mode", "TCC" if tcc else "WDDM")] + attrs += [("FP32/FP64 Performance Ratio", fp32_to_fp64_ratio)] if cc < (3, 5): - support = '[NOT SUPPORTED: CC < 3.5]' + support = "[NOT SUPPORTED: CC < 3.5]" elif cc < (5, 0): - support = '[SUPPORTED (DEPRECATED)]' + support = "[SUPPORTED (DEPRECATED)]" supported_count += 1 else: - support = '[SUPPORTED]' + support = "[SUPPORTED]" supported_count += 1 - print('id %d %20s %40s' % (dev.id, dev.name, support)) + print("id %d %20s %40s" % (dev.id, dev.name, support)) for key, val in attrs: - print('%40s: %s' % (key, val)) + print("%40s: %s" % (key, val)) - print('Summary:') - print('\t%d/%d devices are supported' % (supported_count, len(devlist))) + print("Summary:") + print("\t%d/%d devices are supported" % (supported_count, len(devlist))) return supported_count > 0 diff --git a/numba_cuda/numba/cuda/api_util.py b/numba_cuda/numba/cuda/api_util.py index b8bffb7c1..1b2694af7 100644 --- a/numba_cuda/numba/cuda/api_util.py +++ 
b/numba_cuda/numba/cuda/api_util.py @@ -17,14 +17,14 @@ def _fill_stride_by_order(shape, dtype, order): if nd == 0: return () strides = [0] * nd - if order == 'C': + if order == "C": strides[-1] = dtype.itemsize for d in reversed(range(nd - 1)): strides[d] = strides[d + 1] * shape[d + 1] - elif order == 'F': + elif order == "F": strides[0] = dtype.itemsize for d in range(1, nd): strides[d] = strides[d - 1] * shape[d - 1] else: - raise ValueError('must be either C/F order') + raise ValueError("must be either C/F order") return tuple(strides) diff --git a/numba_cuda/numba/cuda/args.py b/numba_cuda/numba/cuda/args.py index 472bd0b87..ff204c619 100644 --- a/numba_cuda/numba/cuda/args.py +++ b/numba_cuda/numba/cuda/args.py @@ -2,6 +2,7 @@ Hints to wrap Kernel arguments to indicate how to manage host-device memory transfers before & after the kernel call. """ + import abc from numba.core.typing.typeof import typeof, Purpose @@ -31,9 +32,8 @@ def _numba_type_(self): class In(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, _ = auto_device( - self.value, - stream=stream) + + devary, _ = auto_device(self.value, stream=stream) # A dummy writeback functor to keep devary alive until the kernel # is called. 
retr.append(lambda: devary) @@ -43,10 +43,8 @@ def to_device(self, retr, stream=0): class Out(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, conv = auto_device( - self.value, - copy=False, - stream=stream) + + devary, conv = auto_device(self.value, copy=False, stream=stream) if conv: retr.append(lambda: devary.copy_to_host(self.value, stream=stream)) return devary @@ -55,9 +53,8 @@ def to_device(self, retr, stream=0): class InOut(ArgHint): def to_device(self, retr, stream=0): from .cudadrv.devicearray import auto_device - devary, conv = auto_device( - self.value, - stream=stream) + + devary, conv = auto_device(self.value, stream=stream) if conv: retr.append(lambda: devary.copy_to_host(self.value, stream=stream)) return devary @@ -68,10 +65,9 @@ def wrap_arg(value, default=InOut): __all__ = [ - 'In', - 'Out', - 'InOut', - - 'ArgHint', - 'wrap_arg', + "In", + "Out", + "InOut", + "ArgHint", + "wrap_arg", ] diff --git a/numba_cuda/numba/cuda/cg.py b/numba_cuda/numba/cuda/cg.py index 00d55704b..c3dc4add6 100644 --- a/numba_cuda/numba/cuda/cg.py +++ b/numba_cuda/numba/cuda/cg.py @@ -26,13 +26,13 @@ def codegen(context, builder, sig, args): one = context.get_constant(types.int32, 1) mod = builder.module return builder.call( - nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), - (one,)) + nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), (one,) + ) return sig, codegen -@overload(this_grid, target='cuda') +@overload(this_grid, target="cuda") def _ol_this_grid(): def impl(): return _this_grid() @@ -48,13 +48,13 @@ def codegen(context, builder, sig, args): flags = context.get_constant(types.int32, 0) mod = builder.module return builder.call( - nvvmutils.declare_cudaCGSynchronize(mod), - (*args, flags)) + nvvmutils.declare_cudaCGSynchronize(mod), (*args, flags) + ) return sig, codegen -@overload_method(GridGroupClass, 'sync', target='cuda') +@overload_method(GridGroupClass, "sync", target="cuda") def 
_ol_grid_group_sync(group): def impl(group): return _grid_group_sync(group) diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 426eb82b3..2660f574a 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -9,7 +9,7 @@ import subprocess import tempfile -CUDA_TRIPLE = 'nvptx64-nvidia-cuda' +CUDA_TRIPLE = "nvptx64-nvidia-cuda" def run_nvdisasm(cubin, flags): @@ -19,19 +19,24 @@ def run_nvdisasm(cubin, flags): fname = None try: fd, fname = tempfile.mkstemp() - with open(fname, 'wb') as f: + with open(fname, "wb") as f: f.write(cubin) try: - cp = subprocess.run(['nvdisasm', *flags, fname], check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + cp = subprocess.run( + ["nvdisasm", *flags, fname], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) except FileNotFoundError as e: - msg = ("nvdisasm has not been found. You may need " - "to install the CUDA toolkit and ensure that " - "it is available on your PATH.\n") + msg = ( + "nvdisasm has not been found. 
You may need " + "to install the CUDA toolkit and ensure that " + "it is available on your PATH.\n" + ) raise RuntimeError(msg) from e - return cp.stdout.decode('utf-8') + return cp.stdout.decode("utf-8") finally: if fd is not None: os.close(fd) @@ -41,13 +46,13 @@ def run_nvdisasm(cubin, flags): def disassemble_cubin(cubin): # Request lineinfo in disassembly - flags = ['-gi'] + flags = ["-gi"] return run_nvdisasm(cubin, flags) def disassemble_cubin_for_cfg(cubin): # Request control flow graph in disassembly - flags = ['-cfg'] + flags = ["-cfg"] return run_nvdisasm(cubin, flags) @@ -65,7 +70,7 @@ def __init__( entry_name=None, max_registers=None, lto=False, - nvvm_options=None + nvvm_options=None, ): """ codegen: @@ -142,7 +147,7 @@ def get_asm_str(self, cc=None): arch = nvvm.get_arch_option(*cc) options = self._nvvm_options.copy() - options['arch'] = arch + options["arch"] = arch irs = self.llvm_strs @@ -151,12 +156,12 @@ def get_asm_str(self, cc=None): # Sometimes the result from NVVM contains trailing whitespace and # nulls, which we strip so that the assembly dump looks a little # tidier. 
- ptx = ptx.decode().strip('\x00').strip() + ptx = ptx.decode().strip("\x00").strip() if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % self._name).center(80, '-')) + print(("ASSEMBLY %s" % self._name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) self._ptx_cache[cc] = ptx @@ -171,8 +176,8 @@ def get_ltoir(self, cc=None): arch = nvvm.get_arch_option(*cc) options = self._nvvm_options.copy() - options['arch'] = arch - options['gen-lto'] = None + options["arch"] = arch + options["gen-lto"] = None irs = self.llvm_strs ltoir = nvvm.compile_ir(irs, **options) @@ -192,7 +197,7 @@ def _link_all(self, linker, cc, ignore_nonlto=False): linker.add_file_guess_ext(path, ignore_nonlto) if self.needs_cudadevrt: linker.add_file_guess_ext( - get_cudalib('cudadevrt', static=True), ignore_nonlto + get_cudalib("cudadevrt", static=True), ignore_nonlto ) def get_cubin(self, cc=None): @@ -207,22 +212,20 @@ def get_cubin(self, cc=None): max_registers=self._max_registers, cc=cc, additional_flags=["-ptx"], - lto=self._lto + lto=self._lto, ) # `-ptx` flag is meant to view the optimized PTX for LTO objects. # Non-LTO objects are not passed to linker. self._link_all(linker, cc, ignore_nonlto=True) - ptx = linker.get_linked_ptx().decode('utf-8') + ptx = linker.get_linked_ptx().decode("utf-8") - print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-')) + print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) linker = driver.Linker.new( - max_registers=self._max_registers, - cc=cc, - lto=self._lto + max_registers=self._max_registers, cc=cc, lto=self._lto ) self._link_all(linker, cc, ignore_nonlto=False) cubin = linker.complete() @@ -234,8 +237,10 @@ def get_cubin(self, cc=None): def get_cufunc(self): if self._entry_name is None: - msg = "Missing entry_name - are you trying to get the cufunc " \ - "for a device function?" + msg = ( + "Missing entry_name - are you trying to get the cufunc " + "for a device function?" 
+ ) raise RuntimeError(msg) ctx = devices.get_context() @@ -260,7 +265,7 @@ def get_linkerinfo(self, cc): try: return self._linkerinfo_cache[cc] except KeyError: - raise KeyError(f'No linkerinfo for CC {cc}') + raise KeyError(f"No linkerinfo for CC {cc}") def get_sass(self, cc=None): return disassemble_cubin(self.get_cubin(cc=cc)) @@ -271,7 +276,7 @@ def get_sass_cfg(self, cc=None): def add_ir_module(self, mod): self._raise_if_finalized() if self._module is not None: - raise RuntimeError('CUDACodeLibrary only supports one module') + raise RuntimeError("CUDACodeLibrary only supports one module") self._module = mod def add_linking_library(self, library): @@ -291,12 +296,13 @@ def get_function(self, name): for fn in self._module.functions: if fn.name == name: return fn - raise KeyError(f'Function {name} not found') + raise KeyError(f"Function {name} not found") @property def modules(self): - return [self._module] + [mod for lib in self._linking_libraries - for mod in lib.modules] + return [self._module] + [ + mod for lib in self._linking_libraries for mod in lib.modules + ] @property def linking_libraries(self): @@ -331,7 +337,7 @@ def finalize(self): for mod in library.modules: for fn in mod.functions: if not fn.is_declaration: - fn.linkage = 'linkonce_odr' + fn.linkage = "linkonce_odr" self._finalized = True @@ -342,10 +348,10 @@ def _reduce_states(self): after deserialization. 
""" if self._linking_files: - msg = 'Cannot pickle CUDACodeLibrary with linking files' + msg = "Cannot pickle CUDACodeLibrary with linking files" raise RuntimeError(msg) if not self._finalized: - raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary') + raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary") return dict( codegen=None, name=self.name, @@ -356,13 +362,23 @@ def _reduce_states(self): linkerinfo_cache=self._linkerinfo_cache, max_registers=self._max_registers, nvvm_options=self._nvvm_options, - needs_cudadevrt=self.needs_cudadevrt + needs_cudadevrt=self.needs_cudadevrt, ) @classmethod - def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache, - cubin_cache, linkerinfo_cache, max_registers, nvvm_options, - needs_cudadevrt): + def _rebuild( + cls, + codegen, + name, + entry_name, + llvm_strs, + ptx_cache, + cubin_cache, + linkerinfo_cache, + max_registers, + nvvm_options, + needs_cudadevrt, + ): """ Rebuild an instance. """ diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py index 49968890e..2009e777f 100644 --- a/numba_cuda/numba/cuda/compiler.py +++ b/numba_cuda/numba/cuda/compiler.py @@ -1,19 +1,39 @@ from llvmlite import ir from numba.core.typing.templates import ConcreteTemplate from numba.core import ir as numba_ir -from numba.core import (cgutils, types, typing, funcdesc, config, compiler, - sigutils, utils) -from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase, - DefaultPassBuilder, Flags, Option, - CompileResult) +from numba.core import ( + cgutils, + types, + typing, + funcdesc, + config, + compiler, + sigutils, + utils, +) +from numba.core.compiler import ( + sanitize_compile_result_entries, + CompilerBase, + DefaultPassBuilder, + Flags, + Option, + CompileResult, +) from numba.core.compiler_lock import global_compiler_lock -from numba.core.compiler_machinery import (FunctionPass, LoweringPass, - PassManager, register_pass) +from numba.core.compiler_machinery 
import ( + FunctionPass, + LoweringPass, + PassManager, + register_pass, +) from numba.core.interpreter import Interpreter from numba.core.errors import NumbaInvalidConfigWarning from numba.core.untyped_passes import TranslateByteCode -from numba.core.typed_passes import (IRLegalization, NativeLowering, - AnnotateTypes) +from numba.core.typed_passes import ( + IRLegalization, + NativeLowering, + AnnotateTypes, +) from warnings import warn from numba.cuda import nvvmutils from numba.cuda.api import get_current_device @@ -52,15 +72,9 @@ class CUDAFlags(Flags): doc="Compute Capability", ) max_registers = Option( - type=_optional_int_type, - default=None, - doc="Max registers" - ) - lto = Option( - type=bool, - default=False, - doc="Enable Link-time Optimization" + type=_optional_int_type, default=None, doc="Max registers" ) + lto = Option(type=bool, default=False, doc="Enable Link-time Optimization") # The CUDACompileResult (CCR) has a specially-defined entry point equal to its @@ -79,6 +93,7 @@ class CUDAFlags(Flags): # point will no longer need to be a synthetic value, but will instead be a # pointer to the compiled function as in the CPU target. 
+ class CUDACompileResult(CompileResult): @property def entry_point(self): @@ -92,7 +107,6 @@ def cuda_compile_result(**entries): @register_pass(mutates_CFG=True, analysis_only=False) class CUDABackend(LoweringPass): - _name = "cuda_backend" def __init__(self): @@ -102,7 +116,7 @@ def run_pass(self, state): """ Back-end: Packages lowering output in a compile result """ - lowered = state['cr'] + lowered = state["cr"] signature = typing.signature(state.return_type, *state.args) state.cr = cuda_compile_result( @@ -137,9 +151,12 @@ def run_pass(self, state): nvvm_options = state.flags.nvvm_options max_registers = state.flags.max_registers lto = state.flags.lto - state.library = codegen.create_library(name, nvvm_options=nvvm_options, - max_registers=max_registers, - lto=lto) + state.library = codegen.create_library( + name, + nvvm_options=nvvm_options, + max_registers=max_registers, + lto=lto, + ) # Enable object caching upfront so that the library can be serialized. state.library.enable_object_caching() @@ -165,13 +182,15 @@ def _op_JUMP_IF(self, inst, pred, iftrue): gv_fn = numba_ir.Global("bool", bool, loc=self.loc) self.store(value=gv_fn, name=name) - callres = numba_ir.Expr.call(self.get(name), (self.get(pred),), (), - loc=self.loc) + callres = numba_ir.Expr.call( + self.get(name), (self.get(pred),), (), loc=self.loc + ) pname = "$%spred" % (inst.offset) predicate = self.store(value=callres, name=pname) - bra = numba_ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr, - loc=self.loc) + bra = numba_ir.Branch( + cond=predicate, truebr=truebr, falsebr=falsebr, loc=self.loc + ) self.current_block.append(bra) @@ -183,18 +202,18 @@ def __init__(self): FunctionPass.__init__(self) def run_pass(self, state): - func_id = state['func_id'] - bc = state['bc'] + func_id = state["func_id"] + bc = state["bc"] interp = CUDABytecodeInterpreter(func_id) func_ir = interp.interpret(bc) - state['func_ir'] = func_ir + state["func_ir"] = func_ir return True class 
CUDACompiler(CompilerBase): def define_pipelines(self): dpb = DefaultPassBuilder - pm = PassManager('cuda') + pm = PassManager("cuda") untyped_passes = dpb.define_untyped_pipeline(self.state) @@ -225,10 +244,9 @@ def replace_translate_pass(implementation, description): return [pm] def define_cuda_lowering_pipeline(self, state): - pm = PassManager('cuda_lowering') + pm = PassManager("cuda_lowering") # legalise - pm.add_pass(IRLegalization, - "ensure IR is legal prior to lowering") + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") pm.add_pass(AnnotateTypes, "annotate types") # lower @@ -241,13 +259,24 @@ def define_cuda_lowering_pipeline(self, state): @global_compiler_lock -def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, - inline=False, fastmath=False, nvvm_options=None, - cc=None, max_registers=None, lto=False): +def compile_cuda( + pyfunc, + return_type, + args, + debug=False, + lineinfo=False, + inline=False, + fastmath=False, + nvvm_options=None, + cc=None, + max_registers=None, + lto=False, +): if cc is None: - raise ValueError('Compute Capability must be supplied') + raise ValueError("Compute Capability must be supplied") from .descriptor import cuda_target + typingctx = cuda_target.typing_context targetctx = cuda_target.target_context @@ -269,10 +298,10 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, flags.dbg_directives_only = True if debug: - flags.error_model = 'python' + flags.error_model = "python" flags.dbg_extend_lifetimes = True else: - flags.error_model = 'numpy' + flags.error_model = "numpy" if inline: flags.forceinline = True @@ -286,15 +315,18 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, # Run compilation pipeline from numba.core.target_extension import target_override - with target_override('cuda'): - cres = compiler.compile_extra(typingctx=typingctx, - targetctx=targetctx, - func=pyfunc, - args=args, - return_type=return_type, - flags=flags, - 
locals={}, - pipeline_class=CUDACompiler) + + with target_override("cuda"): + cres = compiler.compile_extra( + typingctx=typingctx, + targetctx=targetctx, + func=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=CUDACompiler, + ) library = cres.library library.finalize() @@ -302,8 +334,9 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False, return cres -def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, - nvvm_options): +def cabi_wrap_function( + context, lib, fndesc, wrapper_function_name, nvvm_options +): """ Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level. @@ -311,9 +344,11 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, """ # The wrapper will be contained in a new library that links to the wrapped # function's library - library = lib.codegen.create_library(f'{lib.name}_function_', - entry_name=wrapper_function_name, - nvvm_options=nvvm_options) + library = lib.codegen.create_library( + f"{lib.name}_function_", + entry_name=wrapper_function_name, + nvvm_options=nvvm_options, + ) library.add_linking_library(lib) # Determine the caller (C ABI) and wrapper (Numba ABI) function types @@ -331,14 +366,15 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name, # its return value wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name) - builder = ir.IRBuilder(wrapfn.append_basic_block('')) + builder = ir.IRBuilder(wrapfn.append_basic_block("")) arginfo = context.get_arg_packer(argtypes) callargs = arginfo.from_arguments(builder, wrapfn.args) # We get (status, return_value), but we ignore the status since we # can't propagate it through the C ABI anyway _, return_value = context.call_conv.call_function( - builder, func, restype, argtypes, callargs) + builder, func, restype, argtypes, callargs + ) builder.ret(return_value) if config.DUMP_LLVM: @@ -395,8 +431,10 @@ def kernel_fixup(kernel, debug): # Find all stores first 
for inst in block.instructions: - if (isinstance(inst, ir.StoreInstr) - and inst.operands[1] == return_value): + if ( + isinstance(inst, ir.StoreInstr) + and inst.operands[1] == return_value + ): remove_list.append(inst) # Remove all stores @@ -407,8 +445,9 @@ def kernel_fixup(kernel, debug): # value if isinstance(kernel.type, ir.PointerType): - new_type = ir.PointerType(ir.FunctionType(ir.VoidType(), - kernel.type.pointee.args[1:])) + new_type = ir.PointerType( + ir.FunctionType(ir.VoidType(), kernel.type.pointee.args[1:]) + ) else: new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:]) @@ -418,13 +457,13 @@ def kernel_fixup(kernel, debug): # If debug metadata is present, remove the return value from it - if kernel_metadata := getattr(kernel, 'metadata', None): - if dbg_metadata := kernel_metadata.get('dbg', None): + if kernel_metadata := getattr(kernel, "metadata", None): + if dbg_metadata := kernel_metadata.get("dbg", None): for name, value in dbg_metadata.operands: if name == "type": type_metadata = value for tm_name, tm_value in type_metadata.operands: - if tm_name == 'types': + if tm_name == "types": types = tm_value types.operands = types.operands[1:] if config.DUMP_LLVM: @@ -435,26 +474,24 @@ def kernel_fixup(kernel, debug): nvvm.set_cuda_kernel(kernel) if config.DUMP_LLVM: - print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-')) + print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, "-")) print(kernel.module) - print('=' * 80) + print("=" * 80) def add_exception_store_helper(kernel): - # Create global variables for exception state def define_error_gv(postfix): name = kernel.name + postfix - gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), - name) + gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), name) gv.initializer = ir.Constant(gv.type.pointee, None) return gv gv_exc = define_error_gv("__errcode__") gv_tid = [] gv_ctaid = [] - for i in 'xyz': + for i in "xyz": 
gv_tid.append(define_error_gv("__tid%s__" % i)) gv_ctaid.append(define_error_gv("__ctaid%s__" % i)) @@ -484,18 +521,25 @@ def define_error_gv(postfix): # Use atomic cmpxchg to prevent rewriting the error status # Only the first error is recorded - xchg = builder.cmpxchg(gv_exc, old, status.code, - 'monotonic', 'monotonic') + xchg = builder.cmpxchg( + gv_exc, old, status.code, "monotonic", "monotonic" + ) changed = builder.extract_value(xchg, 1) # If the xchange is successful, save the thread ID. sreg = nvvmutils.SRegBuilder(builder) with builder.if_then(changed): - for dim, ptr, in zip("xyz", gv_tid): + for ( + dim, + ptr, + ) in zip("xyz", gv_tid): val = sreg.tid(dim) builder.store(val, ptr) - for dim, ptr, in zip("xyz", gv_ctaid): + for ( + dim, + ptr, + ) in zip("xyz", gv_ctaid): val = sreg.ctaid(dim) builder.store(val, ptr) @@ -505,9 +549,19 @@ def define_error_gv(postfix): @global_compiler_lock -def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, - fastmath=False, cc=None, opt=None, abi="c", abi_info=None, - output='ptx'): +def compile( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + cc=None, + opt=None, + abi="c", + abi_info=None, + output="ptx", +): """Compile a Python function to PTX or LTO-IR for a given set of argument types. 
@@ -551,43 +605,49 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, :rtype: tuple """ if abi not in ("numba", "c"): - raise NotImplementedError(f'Unsupported ABI: {abi}') + raise NotImplementedError(f"Unsupported ABI: {abi}") - if abi == 'c' and not device: - raise NotImplementedError('The C ABI is not supported for kernels') + if abi == "c" and not device: + raise NotImplementedError("The C ABI is not supported for kernels") if output not in ("ptx", "ltoir"): - raise NotImplementedError(f'Unsupported output type: {output}') + raise NotImplementedError(f"Unsupported output type: {output}") debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug opt = (config.OPT != 0) if opt is None else opt if debug and opt: - msg = ("debug=True with opt=True " - "is not supported by CUDA. This may result in a crash" - " - set debug=False or opt=False.") + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." 
+ ) warn(NumbaInvalidConfigWarning(msg)) - lto = (output == 'ltoir') + lto = output == "ltoir" abi_info = abi_info or dict() - nvvm_options = { - 'fastmath': fastmath, - 'opt': 3 if opt else 0 - } + nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0} if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None if lto: - nvvm_options['gen-lto'] = None + nvvm_options["gen-lto"] = None args, return_type = sigutils.normalize_signature(sig) cc = cc or config.CUDA_DEFAULT_PTX_CC - cres = compile_cuda(pyfunc, return_type, args, debug=debug, - lineinfo=lineinfo, fastmath=fastmath, - nvvm_options=nvvm_options, cc=cc) + cres = compile_cuda( + pyfunc, + return_type, + args, + debug=debug, + lineinfo=lineinfo, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + ) resty = cres.signature.return_type if resty and not device and resty != types.void: @@ -598,9 +658,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, if device: lib = cres.library if abi == "c": - wrapper_name = abi_info.get('abi_name', pyfunc.__name__) - lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name, - nvvm_options) + wrapper_name = abi_info.get("abi_name", pyfunc.__name__) + lib = cabi_wrap_function( + tgt, lib, cres.fndesc, wrapper_name, nvvm_options + ) else: lib = cres.library kernel = lib.get_function(cres.fndesc.llvm_func_name) @@ -614,38 +675,94 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True, return code, resty -def compile_for_current_device(pyfunc, sig, debug=None, lineinfo=False, - device=True, fastmath=False, opt=None, - abi="c", abi_info=None, output='ptx'): +def compile_for_current_device( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=True, + fastmath=False, + opt=None, + abi="c", + abi_info=None, + output="ptx", +): """Compile a Python function to PTX or LTO-IR for a given signature for the current device's compute capabilility. 
This calls :func:`compile` with an appropriate ``cc`` value for the current device.""" cc = get_current_device().compute_capability - return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device, - fastmath=fastmath, cc=cc, opt=opt, abi=abi, - abi_info=abi_info, output=output) + return compile( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + output=output, + ) -def compile_ptx(pyfunc, sig, debug=None, lineinfo=False, device=False, - fastmath=False, cc=None, opt=None, abi="numba", abi_info=None): +def compile_ptx( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=False, + fastmath=False, + cc=None, + opt=None, + abi="numba", + abi_info=None, +): """Compile a Python function to PTX for a given signature. See :func:`compile`. The defaults for this function are to compile a kernel with the Numba ABI, rather than :func:`compile`'s default of compiling a device function with the C ABI.""" - return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device, - fastmath=fastmath, cc=cc, opt=opt, abi=abi, - abi_info=abi_info, output='ptx') + return compile( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + output="ptx", + ) -def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False, - device=False, fastmath=False, opt=None, - abi="numba", abi_info=None): +def compile_ptx_for_current_device( + pyfunc, + sig, + debug=None, + lineinfo=False, + device=False, + fastmath=False, + opt=None, + abi="numba", + abi_info=None, +): """Compile a Python function to PTX for a given signature for the current device's compute capabilility. 
See :func:`compile_ptx`.""" cc = get_current_device().compute_capability - return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo, - device=device, fastmath=fastmath, cc=cc, opt=opt, - abi=abi, abi_info=abi_info) + return compile_ptx( + pyfunc, + sig, + debug=debug, + lineinfo=lineinfo, + device=device, + fastmath=fastmath, + cc=cc, + opt=opt, + abi=abi, + abi_info=abi_info, + ) def declare_device_function(name, restype, argtypes, link): @@ -654,6 +771,7 @@ def declare_device_function(name, restype, argtypes, link): def declare_device_function_template(name, restype, argtypes, link): from .descriptor import cuda_target + typingctx = cuda_target.typing_context targetctx = cuda_target.target_context sig = typing.signature(restype, *argtypes) @@ -664,7 +782,8 @@ class device_function_template(ConcreteTemplate): cases = [sig] fndesc = funcdesc.ExternalFunctionDescriptor( - name=name, restype=restype, argtypes=argtypes) + name=name, restype=restype, argtypes=argtypes + ) typingctx.insert_user_function(extfn, device_function_template) targetctx.insert_user_function(extfn, fndesc) diff --git a/numba_cuda/numba/cuda/cpp_function_wrappers.cu b/numba_cuda/numba/cuda/cpp_function_wrappers.cu index a2cd1e054..105152805 100644 --- a/numba_cuda/numba/cuda/cpp_function_wrappers.cu +++ b/numba_cuda/numba/cuda/cpp_function_wrappers.cu @@ -23,7 +23,7 @@ FNDEF(hdiv)( ) { __half retval = __hdiv(__short_as_half (x), __short_as_half (y)); - + *return_value = __half_as_short (retval); // Signal that no Python exception occurred return 0; @@ -44,4 +44,3 @@ UNARY_FUNCTION(hceil) UNARY_FUNCTION(hrcp) UNARY_FUNCTION(hrint) UNARY_FUNCTION(htrunc) - diff --git a/numba_cuda/numba/cuda/cuda_fp16.h b/numba_cuda/numba/cuda/cuda_fp16.h index 3001595e9..9780be106 100644 --- a/numba_cuda/numba/cuda/cuda_fp16.h +++ b/numba_cuda/numba/cuda/cuda_fp16.h @@ -112,33 +112,33 @@ /* Forward-declaration of structures defined in "cuda_fp16.hpp" */ /** - * \brief half datatype - * - * \details This 
structure implements the datatype for storing - * half-precision floating-point numbers. The structure implements - * assignment operators and type conversions. - * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, - * and the significand is being stored in 10 bits. - * The total precision is 11 bits. There are 15361 representable - * numbers within the interval [0.0, 1.0], endpoints included. - * On average we have log10(2**11) ~ 3.311 decimal digits. - * + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * * \internal - * \req IEEE 754-2008 compliant implementation of half-precision - * floating-point numbers. + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. * \endinternal */ struct __half; /** * \brief half2 datatype - * - * \details This structure implements the datatype for storing two - * half-precision floating-point numbers. - * The structure implements assignment operators and type conversions. - * + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * * \internal - * \req Vectorified version of half. + * \req Vectorified version of half. 
* \endinternal */ struct __half2; @@ -161,12 +161,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-to-nearest-even mode -* and returns \p half with converted value. -* -* \details Converts float number \p a to half precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -179,9 +179,9 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); * and returns \p half with converted value. * * \details Converts float number \p a to half precision in round-to-nearest-even mode. -* \param[in] a - float. Is only being read. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -192,11 +192,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-towards-zero mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-towards-zero mode. -* \param[in] a - float. Is only being read. +* \param[in] a - float. Is only being read. * \returns half -* \retval a converted to half. +* \retval a converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -207,12 +207,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-down mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-down mode. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -223,12 +223,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts float number to half precision in round-up mode * and returns \p half with converted value. -* +* * \details Converts float number \p a to half precision in round-up mode. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns half -* \retval a converted to half. +* \retval a converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -238,12 +238,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts \p half number to float. -* +* * \details Converts half number \p a to float. -* \param[in] a - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* * \returns float -* \retval a converted to float. +* \retval a converted to float. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -257,7 +257,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); * * \details Converts input \p a to half precision in round-to-nearest-even mode and * populates both halves of \p half2 with converted value. -* \param[in] a - float. 
Is only being read. +* \param[in] a - float. Is only being read. * * \returns half2 * \retval The \p half2 value with both halves equal to the converted half @@ -277,9 +277,9 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); * and combines the results into one \p half2 number. Low 16 bits of the return * value correspond to the input \p a, high 16 bits correspond to the input \p * b. -* \param[in] a - float. Is only being read. -* \param[in] b - float. Is only being read. -* +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* * \returns half2 * \retval The \p half2 value with corresponding halves equal to the * converted input floats. @@ -292,11 +292,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const flo /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts low 16 bits of \p half2 to float and returns the result -* +* * \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number * and returns the result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float * \retval The low 16 bits of \p a converted to float. * \internal @@ -308,11 +308,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts high 16 bits of \p half2 to float and returns the result -* +* * \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number * and returns the result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float * \retval The high 16 bits of \p a converted to float. * \internal @@ -327,13 +327,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); * \ingroup CUDA_MATH__HALF_MISC * \brief Converts both components of float2 number to half precision in * round-to-nearest-even mode and returns \p half2 with converted values. 
-* +* * \details Converts both components of float2 to half precision in round-to-nearest * mode and combines the results into one \p half2 number. Low 16 bits of the * return value correspond to \p a.x and high 16 bits of the return value * correspond to \p a.y. -* \param[in] a - float2. Is only being read. -* +* \param[in] a - float2. Is only being read. +* * \returns half2 * \retval The \p half2 which has corresponding halves equal to the * converted float2 components. @@ -346,11 +346,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Converts both halves of \p half2 to float2 and returns the result. -* +* * \details Converts both halves of \p half2 input \p a to float2 and returns the * result. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns float2 * \retval a converted to float2. * \internal @@ -362,13 +362,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-to-nearest-even mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -378,13 +378,13 @@ __CUDA_FP16_DECL__ int __half2int_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. 
+* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -394,13 +394,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -410,13 +410,13 @@ __CUDA_FP16_DECL__ int __half2int_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed integer in * round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns int -* \retval h converted to a signed integer. +* \retval h converted to a signed integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -427,13 +427,13 @@ __CUDA_FP16_DECL__ int __half2int_ru(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-to-nearest-even mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -443,13 +443,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-towards-zero mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -459,13 +459,13 @@ __CUDA_FP16_DECL__ __half __int2half_rz(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-down mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -475,13 +475,13 @@ __CUDA_FP16_DECL__ __half __int2half_rd(const int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed integer to a half in round-up mode. -* +* * \details Convert the signed integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - int. Is only being read. -* +* \param[in] i - int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -493,13 +493,13 @@ __CUDA_FP16_DECL__ __half __int2half_ru(const int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-to-nearest-even * mode. 
-* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -509,13 +509,13 @@ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -525,13 +525,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -541,13 +541,13 @@ __CUDA_FP16_DECL__ short int __half2short_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed short integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed short * integer in round-up mode. 
-* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval h converted to a signed short integer. +* \retval h converted to a signed short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -559,13 +559,13 @@ __CUDA_FP16_DECL__ short int __half2short_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -575,13 +575,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-towards-zero mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -591,13 +591,13 @@ __CUDA_FP16_DECL__ __half __short2half_rz(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-down mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. 
+* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -607,13 +607,13 @@ __CUDA_FP16_DECL__ __half __short2half_rd(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed short integer to a half in round-up mode. -* +* * \details Convert the signed short integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -624,13 +624,13 @@ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned integer in round-to-nearest-even mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -640,13 +640,13 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -659,10 +659,10 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); * * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-down mode. -* \param[in] h - half. Is only being read. +* \param[in] h - half. Is only being read. * * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -675,10 +675,10 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); * * \details Convert the half-precision floating-point value \p h to an unsigned integer * in round-up mode. -* \param[in] h - half. Is only being read. +* \param[in] h - half. Is only being read. * * \returns unsigned int -* \retval h converted to an unsigned integer. +* \retval h converted to an unsigned integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -689,13 +689,13 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-to-nearest-even mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -705,13 +705,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-towards-zero mode. 
-* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -721,13 +721,13 @@ __CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-down mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -737,13 +737,13 @@ __CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned integer to a half in round-up mode. -* +* * \details Convert the unsigned integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned int. Is only being read. -* +* \param[in] i - unsigned int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -755,13 +755,13 @@ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. 
Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -772,13 +772,13 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-towards-zero * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -788,25 +788,25 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. */ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned short integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned short * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int -* \retval h converted to an unsigned short integer. +* \retval h converted to an unsigned short integer. 
*/ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); @@ -814,13 +814,13 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -831,13 +831,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-towards-zero * mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -847,13 +847,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-down mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -863,13 +863,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned short integer to a half in round-up mode. -* +* * \details Convert the unsigned short integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -881,13 +881,13 @@ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -898,13 +898,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-towards-zero * mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -914,13 +914,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -930,13 +930,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to an unsigned 64-bit integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned long long int -* \retval h converted to an unsigned 64-bit integer. +* \retval h converted to an unsigned 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -948,13 +948,13 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -965,13 +965,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-towards-zero * mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -981,13 +981,13 @@ __CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-down mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -997,13 +997,13 @@ __CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert an unsigned 64-bit integer to a half in round-up mode. -* +* * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - unsigned long long int. Is only being read. -* +* \param[in] i - unsigned long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1015,13 +1015,13 @@ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-to-nearest-even * mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-to-nearest-even mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1031,13 +1031,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-towards-zero mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1047,13 +1047,13 @@ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-down mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-down mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1063,13 +1063,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a half to a signed 64-bit integer in round-up mode. -* +* * \details Convert the half-precision floating-point value \p h to a signed 64-bit * integer in round-up mode. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns long long int -* \retval h converted to a signed 64-bit integer. +* \retval h converted to a signed 64-bit integer. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1081,13 +1081,13 @@ __CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h); * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-to-nearest-even * mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-to-nearest-even mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1097,25 +1097,25 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-towards-zero mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. 
*/ __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-down mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-down mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1125,13 +1125,13 @@ __CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Convert a signed 64-bit integer to a half in round-up mode. -* +* * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point * value in round-up mode. -* \param[in] i - long long int. Is only being read. -* +* \param[in] i - long long int. Is only being read. +* * \returns half -* \retval i converted to half. +* \retval i converted to half. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1142,13 +1142,13 @@ __CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Truncate input argument to the integral part. -* +* * \details Round \p h to the nearest integer value that does not exceed \p h in * magnitude. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The truncated integer value. +* \retval The truncated integer value. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1158,12 +1158,12 @@ __CUDA_FP16_DECL__ __half htrunc(const __half h); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Calculate ceiling of the input argument. -* +* * \details Compute the smallest integer value not less than \p h. -* \param[in] h - half. 
Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The smallest integer value not less than \p h. +* \retval The smallest integer value not less than \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1173,12 +1173,12 @@ __CUDA_FP16_DECL__ __half hceil(const __half h); /** * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Calculate the largest integer less than or equal to \p h. -* +* * \details Calculate the largest integer value which is less than or equal to \p h. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The largest integer value which is less than or equal to \p h. +* \retval The largest integer value which is less than or equal to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1189,13 +1189,13 @@ __CUDA_FP16_DECL__ __half hfloor(const __half h); * \ingroup CUDA_MATH__HALF_FUNCTIONS * \brief Round input to nearest integer value in half-precision floating-point * number. -* +* * \details Round \p h to the nearest integer value in half-precision floating-point * format, with halfway cases rounded to the nearest even integer value. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns half -* \retval The nearest integer to \p h. +* \retval The nearest integer to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1206,13 +1206,13 @@ __CUDA_FP16_DECL__ __half hrint(const __half h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Truncate \p half2 vector input argument to the integral part. -* +* * \details Round each component of vector \p h to the nearest integer value that does * not exceed \p h in magnitude. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. 
+* * \returns half2 -* \retval The truncated \p h. +* \retval The truncated \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1222,13 +1222,13 @@ __CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculate \p half2 vector ceiling of the input argument. -* +* * \details For each component of vector \p h compute the smallest integer value not less * than \p h. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. +* * \returns half2 -* \retval The vector of smallest integers not less than \p h. +* \retval The vector of smallest integers not less than \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1238,13 +1238,13 @@ __CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculate the largest integer less than or equal to \p h. -* +* * \details For each component of vector \p h calculate the largest integer value which * is less than or equal to \p h. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. +* * \returns half2 -* \retval The vector of largest integers which is less than or equal to \p h. +* \retval The vector of largest integers which is less than or equal to \p h. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1255,14 +1255,14 @@ __CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Round input to nearest integer value in half-precision floating-point * number. -* +* * \details Round each component of \p half2 vector \p h to the nearest integer value in * half-precision floating-point format, with halfway cases rounded to the * nearest even integer value. -* \param[in] h - half2. Is only being read. -* +* \param[in] h - half2. Is only being read. 
+* * \returns half2 -* \retval The vector of rounded integer values. +* \retval The vector of rounded integer values. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1273,13 +1273,13 @@ __CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Returns \p half2 with both halves equal to the input value. -* +* * \details Returns \p half2 number with both halves equal to the input \p a \p half * number. -* \param[in] a - half. Is only being read. -* +* \param[in] a - half. Is only being read. +* * \returns half2 -* \retval The vector which has both its halves equal to the input \p a. +* \retval The vector which has both its halves equal to the input \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1289,13 +1289,13 @@ __CUDA_FP16_DECL__ __half2 __half2half2(const __half a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Swaps both halves of the \p half2 input. -* +* * \details Swaps both halves of the \p half2 input and returns a new \p half2 number * with swapped halves. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval a with its halves being swapped. +* \retval a with its halves being swapped. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1305,17 +1305,17 @@ __CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts low 16 bits from each of the two \p half2 inputs and combines -* into one \p half2 number. -* +* into one \p half2 number. +* * \details Extracts low 16 bits from each of the two \p half2 inputs and combines into * one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of * the return value, low 16 bits from input \p b is stored in high 16 bits of -* the return value. -* \param[in] a - half2. Is only being read. 
-* \param[in] b - half2. Is only being read. -* +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 -* \retval The low 16 bits of \p a and of \p b. +* \retval The low 16 bits of \p a and of \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1326,16 +1326,16 @@ __CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts high 16 bits from each of the two \p half2 inputs and * combines into one \p half2 number. -* +* * \details Extracts high 16 bits from each of the two \p half2 inputs and combines into * one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of * the return value, high 16 bits from input \p b is stored in high 16 bits of * the return value. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 -* \retval The high 16 bits of \p a and of \p b. +* \retval The high 16 bits of \p a and of \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1347,10 +1347,10 @@ __CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); * \brief Returns high 16 bits of \p half2 input. * * \details Returns high 16 bits of \p half2 input \p a. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half -* \retval The high 16 bits of the input. +* \retval The high 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1362,10 +1362,10 @@ __CUDA_FP16_DECL__ __half __high2half(const __half2 a); * \brief Returns low 16 bits of \p half2 input. * * \details Returns low 16 bits of \p half2 input \p a. -* \param[in] a - half2. Is only being read. 
+* \param[in] a - half2. Is only being read. * * \returns half -* \retval Returns \p half which contains low 16 bits of the input \p a. +* \retval Returns \p half which contains low 16 bits of the input \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1375,14 +1375,14 @@ __CUDA_FP16_DECL__ __half __low2half(const __half2 a); /** * \ingroup CUDA_MATH__HALF_COMPARISON * \brief Checks if the input \p half number is infinite. -* -* \details Checks if the input \p half number \p a is infinite. -* \param[in] a - half. Is only being read. -* -* \returns int -* \retval -1 iff \p a is equal to negative infinity, -* \retval 1 iff \p a is equal to positive infinity, -* \retval 0 otherwise. +* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* \retval -1 iff \p a is equal to negative infinity, +* \retval 1 iff \p a is equal to positive infinity, +* \retval 0 otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1392,15 +1392,15 @@ __CUDA_FP16_DECL__ int __hisinf(const __half a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Combines two \p half numbers into one \p half2 number. -* +* * \details Combines two input \p half number \p a and \p b into one \p half2 number. * Input \p a is stored in low 16 bits of the return value, input \p b is stored * in high 16 bits of the return value. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* * \returns half2 -* \retval The half2 with one half equal to \p a and the other to \p b. +* \retval The half2 with one half equal to \p a and the other to \p b. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1410,13 +1410,13 @@ __CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts low 16 bits from \p half2 input. -* +* * \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 * number which has both halves equal to the extracted bits. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval The half2 with both halves equal to the low 16 bits of the input. +* \retval The half2 with both halves equal to the low 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1426,13 +1426,13 @@ __CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Extracts high 16 bits from \p half2 input. -* +* * \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 * number which has both halves equal to the extracted bits. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 -* \retval The half2 with both halves equal to the high 16 bits of the input. +* \retval The half2 with both halves equal to the high 16 bits of the input. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1443,13 +1443,13 @@ __CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a \p half as a signed short integer. -* +* * \details Reinterprets the bits in the half-precision floating-point number \p h -* as a signed short integer. -* \param[in] h - half. Is only being read. -* +* as a signed short integer. +* \param[in] h - half. Is only being read. +* * \returns short int -* \retval The reinterpreted value. +* \retval The reinterpreted value. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -1459,11 +1459,11 @@ __CUDA_FP16_DECL__ short int __half_as_short(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a \p half as an unsigned short integer. -* +* * \details Reinterprets the bits in the half-precision floating-point \p h * as an unsigned short number. -* \param[in] h - half. Is only being read. -* +* \param[in] h - half. Is only being read. +* * \returns unsigned short int * \retval The reinterpreted value. * \internal @@ -1475,11 +1475,11 @@ __CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in a signed short integer as a \p half. -* +* * \details Reinterprets the bits in the signed short integer \p i as a * half-precision floating-point number. -* \param[in] i - short int. Is only being read. -* +* \param[in] i - short int. Is only being read. +* * \returns half * \retval The reinterpreted value. * \internal @@ -1491,11 +1491,11 @@ __CUDA_FP16_DECL__ __half __short_as_half(const short int i); /** * \ingroup CUDA_MATH__HALF_MISC * \brief Reinterprets bits in an unsigned short integer as a \p half. -* +* * \details Reinterprets the bits in the unsigned short integer \p i as a * half-precision floating-point number. -* \param[in] i - unsigned short int. Is only being read. -* +* \param[in] i - unsigned short int. Is only being read. +* * \returns half * \retval The reinterpreted value. * \internal @@ -1534,22 +1534,22 @@ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. 
-* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1558,22 +1558,22 @@ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. 
The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1582,22 +1582,22 @@ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, c __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. -* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
-* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1606,21 +1606,21 @@ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. 
If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half2. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1629,22 +1629,22 @@ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 v __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. -* -* \details Returns the value of var held by the thread whose ID is given by delta. -* If width is less than warpSize then each subsection of the warp behaves as a separate -* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], -* the value returned corresponds to the value of var held by the delta modulo width (i.e. -* within the same subsection). width must have a value which is a power of 2; -* results are undefined if width is not a power of 2, or is a number greater than -* warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). 
width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1653,21 +1653,21 @@ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 va __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. -* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. -* The value of var held by the resulting lane ID is returned: in effect, var is shifted up -* the warp by delta threads. If width is less than warpSize then each subsection of the warp -* behaves as a separate entity with a starting logical thread ID of 0. The source thread index -* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. -* width must have a value which is a power of 2; results are undefined if width is not a power of 2, -* or is a number greater than warpSize. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
-* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1676,22 +1676,22 @@ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, con __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. -* -* \details Calculates a source thread ID by adding delta to the caller's thread ID. -* The value of var held by the resulting thread ID is returned: this has the effect -* of shifting var down the warp by delta threads. 
If width is less than warpSize then -* each subsection of the warp behaves as a separate entity with a starting logical -* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread -* will not wrap around the value of width and so the upper delta threads -* will remain unchanged. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
* \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1700,21 +1700,21 @@ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); /** * \ingroup CUDA_MATH__HALF_MISC -* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. -* -* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: -* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each -* group of width consecutive threads are able to access elements from earlier groups of threads, -* however if they attempt to access elements from later groups of threads their own value of var -* will be returned. This mode implements a butterfly addressing pattern such as is used in tree -* reduction and broadcast. -* \param[in] mask - unsigned int. Is only being read. -* \param[in] var - half. Is only being read. -* \param[in] delta - int. Is only being read. -* \param[in] width - int. Is only being read. -* -* \returns Returns the 2-byte word referenced by var from the source thread ID as half. -* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. 
This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. * \internal * \exception-guarantee no-throw guarantee * \behavior not reentrant, not thread safe @@ -1875,13 +1875,13 @@ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs half2 vector if-equal comparison. -* +* * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of if-equal comparison of vectors \p a and \p b. * \internal @@ -1893,13 +1893,13 @@ __CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector not-equal comparison. -* +* * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of not-equal comparison of vectors \p a and \p b. 
* \internal @@ -1915,8 +1915,8 @@ __CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 result of less-equal comparison of vectors \p a and \p b. @@ -1933,8 +1933,8 @@ __CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of greater-equal comparison of vectors \p a and \p b. @@ -1951,8 +1951,8 @@ __CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The half2 vector result of less-than comparison of vectors \p a and \p b. @@ -1965,13 +1965,13 @@ __CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector greater-than comparison. 
-* +* * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of greater-than comparison of vectors \p a and \p b. * \internal @@ -1983,13 +1983,13 @@ __CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); /** * \ingroup CUDA_MATH__HALF2_COMPARISON * \brief Performs \p half2 vector unordered if-equal comparison. -* +* * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* * \returns half2 * \retval The vector result of unordered if-equal comparison of vectors \p a and \p b. * \internal @@ -2005,8 +2005,8 @@ __CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered not-equal comparison of vectors \p a and \p b. @@ -2023,8 +2023,8 @@ __CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); * Performs \p half2 vector less-equal comparison of inputs \p a and \p b. 
* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered less-equal comparison of vectors \p a and \p b. @@ -2041,8 +2041,8 @@ __CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. @@ -2059,8 +2059,8 @@ __CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The vector result of unordered less-than comparison of vectors \p a and \p b. @@ -2077,8 +2077,8 @@ __CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. * The corresponding \p half results are set to 1.0 for true, or 0.0 for false. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. @@ -2093,11 +2093,11 @@ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); * \brief Determine whether \p half2 argument is a NaN. * * \details Determine whether each half of input \p half2 number \p a is a NaN. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The half2 with the corresponding \p half results set to -* 1.0 for NaN, 0.0 otherwise. +* 1.0 for NaN, 0.0 otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2113,11 +2113,11 @@ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); * \internal * \req DEEPLEARN-SRM_REQ-95 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The sum of vectors \p a and \p b. +* \retval The sum of vectors \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2133,11 +2133,11 @@ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-104 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The subtraction of vector \p b from \p a. +* \retval The subtraction of vector \p b from \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2153,11 +2153,11 @@ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-102 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. 
Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise multiplying the vectors \p a and \p b. +* \retval The result of elementwise multiplying the vectors \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2173,11 +2173,11 @@ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-103 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The elementwise division of \p a with \p b. +* \retval The elementwise division of \p a with \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2191,10 +2191,10 @@ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); * * \details Calculates the absolute value of both halves of the input \p half2 number and * returns the result. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 -* \retval Returns \p a with the absolute value of both halves. +* \retval Returns \p a with the absolute value of both halves. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2209,11 +2209,11 @@ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); * \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest * mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to * +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The sum of \p a and \p b, with respect to saturation. 
+* \retval The sum of \p a and \p b, with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2228,8 +2228,8 @@ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); * \details Subtracts \p half2 input vector \p b from input vector \p a in * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN * results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 * \retval The subtraction of vector \p b from \p a, with respect to saturation. @@ -2247,12 +2247,12 @@ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); * \details Performs \p half2 vector multiplication of inputs \p a and \p b, in * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN * results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise multiplication of vectors \p a and \p b, -* with respect to saturation. +* \retval The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2270,12 +2270,12 @@ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); * \internal * \req DEEPLEARN-SRM_REQ-105 * \endinternal -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* \param[in] c - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. 
* * \returns half2 -* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2291,13 +2291,13 @@ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __hal * then performs a \p half2 vector add of the result with \p c, * rounding the result once in round-to-nearest-even mode, and clamps the * results to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* \param[in] c - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. * * \returns half2 -* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, -* with respect to saturation. +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2313,10 +2313,10 @@ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const _ * \internal * \req DEEPLEARN-SRM_REQ-101 * \endinternal -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 -* \retval Returns \p a with both halves negated. +* \retval Returns \p a with both halves negated. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2328,7 +2328,7 @@ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); * \brief Calculates the absolute value of input \p half number and returns the result. * * \details Calculates the absolute value of input \p half number and returns the result. -* \param[in] a - half. 
Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The absolute value of a. @@ -2347,11 +2347,11 @@ __CUDA_FP16_DECL__ __half __habs(const __half a); * \internal * \req DEEPLEARN-SRM_REQ-94 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The sum of \p a and \p b. +* \retval The sum of \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2367,11 +2367,11 @@ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-97 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The result of subtracting \p b from \p a. +* \retval The result of subtracting \p b from \p a. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2387,27 +2387,27 @@ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-99 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half -* \retval The result of multiplying \p a and \p b. +* \retval The result of multiplying \p a and \p b. */ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); /** * \ingroup CUDA_MATH__HALF_ARITHMETIC * \brief Performs \p half division in round-to-nearest-even mode. -* +* * \details Divides \p half input \p a by input \p b in round-to-nearest * mode. * \internal * \req DEEPLEARN-SRM_REQ-98 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. 
-* +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* * \returns half -* \retval The result of dividing \p a by \p b. +* \retval The result of dividing \p a by \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2421,8 +2421,8 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); * * \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The sum of \p a and \p b, with respect to saturation. @@ -2440,8 +2440,8 @@ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); * \details Subtracts \p half input \p b from input \p a in round-to-nearest * mode, * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The result of subtraction of \p b from \p a, with respect to saturation. @@ -2459,8 +2459,8 @@ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); * \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest * mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to * +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns half * \retval The result of multiplying \p a and \p b, with respect to saturation. 
@@ -2480,13 +2480,13 @@ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); * \internal * \req DEEPLEARN-SRM_REQ-96 * \endinternal -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* \param[in] c - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. * * \returns half * \retval The result of fused multiply-add operation on \p -* a, \p b, and \p c. +* a, \p b, and \p c. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2502,13 +2502,13 @@ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) * then performs a \p half add of the result with \p c, * rounding the result once in round-to-nearest-even mode, and clamps the result * to range [0.0, 1.0]. NaN results are flushed to +0.0. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. -* \param[in] c - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. * * \returns half * \retval The result of fused multiply-add operation on \p -* a, \p b, and \p c, with respect to saturation. +* a, \p b, and \p c, with respect to saturation. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2523,7 +2523,7 @@ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __hal * \internal * \req DEEPLEARN-SRM_REQ-100 * \endinternal -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval minus a @@ -2542,8 +2542,8 @@ __CUDA_FP16_DECL__ __half __hneg(const __half a); * The bool result is set to true only if both \p half if-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. 
Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of if-equal comparison @@ -2564,13 +2564,13 @@ __CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half not-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of not-equal comparison -* of vectors \p a and \p b are true, -* \retval false otherwise. +* of vectors \p a and \p b are true, +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2586,13 +2586,13 @@ __CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of less-equal comparison -* of vectors \p a and \p b are true; -* \retval false otherwise. +* of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2608,13 +2608,13 @@ __CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. 
-* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of greater-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2630,13 +2630,13 @@ __CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of less-than comparison -* of vectors \p a and \p b are true; -* \retval false otherwise. +* of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2652,13 +2652,13 @@ __CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate false results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. -* -* \returns bool +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool * \retval true if both \p half results of greater-than -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2674,13 +2674,13 @@ __CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half if-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered if-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2696,13 +2696,13 @@ __CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half not-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered not-equal * comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2718,13 +2718,13 @@ __CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
* * \returns bool * \retval true if both \p half results of unordered less-equal -* comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2741,13 +2741,13 @@ __CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-equal comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered -* greater-equal comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* greater-equal comparison of vectors \p a and \p b are true; +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2763,13 +2763,13 @@ __CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half less-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool -* \retval true if both \p half results of unordered less-than comparison of -* vectors \p a and \p b are true; -* \retval false otherwise. +* \retval true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* \retval false otherwise. 
* \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2786,13 +2786,13 @@ __CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); * The bool result is set to true only if both \p half greater-than comparisons * evaluate to true, or false otherwise. * NaN inputs generate true results. -* \param[in] a - half2. Is only being read. -* \param[in] b - half2. Is only being read. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. * * \returns bool * \retval true if both \p half results of unordered * greater-than comparison of vectors \p a and \p b are true; -* \retval false otherwise. +* \retval false otherwise. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2805,11 +2805,11 @@ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); * * \details Performs \p half if-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool -* \retval The boolean result of if-equal comparison of \p a and \p b. +* \retval The boolean result of if-equal comparison of \p a and \p b. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -2822,8 +2822,8 @@ __CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); * * \details Performs \p half not-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of not-equal comparison of \p a and \p b. 
@@ -2839,8 +2839,8 @@ __CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); * * \details Performs \p half less-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of less-equal comparison of \p a and \p b. @@ -2856,8 +2856,8 @@ __CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); * * \details Performs \p half greater-equal comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of greater-equal comparison of \p a and \p b. @@ -2873,8 +2873,8 @@ __CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); * * \details Performs \p half less-than comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of less-than comparison of \p a and \p b. @@ -2890,8 +2890,8 @@ __CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); * * \details Performs \p half greater-than comparison of inputs \p a and \p b. * NaN inputs generate false results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of greater-than comparison of \p a and \p b. 
@@ -2907,8 +2907,8 @@ __CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); * * \details Performs \p half if-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered if-equal comparison of \p a and @@ -2925,8 +2925,8 @@ __CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); * * \details Performs \p half not-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered not-equal comparison of \p a and @@ -2943,8 +2943,8 @@ __CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); * * \details Performs \p half less-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered less-equal comparison of \p a and @@ -2961,8 +2961,8 @@ __CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); * * \details Performs \p half greater-equal comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
* * \returns bool * \retval The boolean result of unordered greater-equal comparison of \p a @@ -2979,8 +2979,8 @@ __CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); * * \details Performs \p half less-than comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered less-than comparison of \p a and @@ -2997,8 +2997,8 @@ __CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); * * \details Performs \p half greater-than comparison of inputs \p a and \p b. * NaN inputs generate true results. -* \param[in] a - half. Is only being read. -* \param[in] b - half. Is only being read. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. * * \returns bool * \retval The boolean result of unordered greater-than comparison of \p a @@ -3014,10 +3014,10 @@ __CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); * \brief Determine whether \p half argument is a NaN. * * \details Determine whether \p half value \p a is a NaN. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns bool -* \retval true iff argument is NaN. +* \retval true iff argument is NaN. * \internal * \exception-guarantee no-throw guarantee * \behavior reentrant, thread safe @@ -3250,7 +3250,7 @@ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __ha * \brief Calculates \p half square root in round-to-nearest-even mode. * * \details Calculates \p half square root of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The square root of \p a. 
@@ -3267,7 +3267,7 @@ __CUDA_FP16_DECL__ __half hsqrt(const __half a); * * \details Calculates \p half reciprocal square root of input \p a in round-to-nearest * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The reciprocal square root of \p a. @@ -3282,7 +3282,7 @@ __CUDA_FP16_DECL__ __half hrsqrt(const __half a); * \brief Calculates \p half reciprocal in round-to-nearest-even mode. * * \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The reciprocal of \p a. @@ -3298,7 +3298,7 @@ __CUDA_FP16_DECL__ __half hrcp(const __half a); * * \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The natural logarithm of \p a. @@ -3314,7 +3314,7 @@ __CUDA_FP16_DECL__ __half hlog(const __half a); * * \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The binary logarithm of \p a. @@ -3330,7 +3330,7 @@ __CUDA_FP16_DECL__ __half hlog2(const __half a); * * \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even * mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The decimal logarithm of \p a. @@ -3347,7 +3347,7 @@ __CUDA_FP16_DECL__ __half hlog10(const __half a); * * \details Calculates \p half natural exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The natural exponential function on \p a. 
@@ -3364,7 +3364,7 @@ __CUDA_FP16_DECL__ __half hexp(const __half a); * * \details Calculates \p half binary exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The binary exponential function on \p a. @@ -3381,7 +3381,7 @@ __CUDA_FP16_DECL__ __half hexp2(const __half a); * * \details Calculates \p half decimal exponential function of input \p a in * round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The decimal exponential function on \p a. @@ -3396,7 +3396,7 @@ __CUDA_FP16_DECL__ __half hexp10(const __half a); * \brief Calculates \p half cosine in round-to-nearest-even mode. * * \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The cosine of \p a. @@ -3411,7 +3411,7 @@ __CUDA_FP16_DECL__ __half hcos(const __half a); * \brief Calculates \p half sine in round-to-nearest-even mode. * * \details Calculates \p half sine of input \p a in round-to-nearest-even mode. -* \param[in] a - half. Is only being read. +* \param[in] a - half. Is only being read. * * \returns half * \retval The sine of \p a. @@ -3427,7 +3427,7 @@ __CUDA_FP16_DECL__ __half hsin(const __half a); * * \details Calculates \p half2 square root of input vector \p a in round-to-nearest * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise square root on vector \p a. @@ -3444,7 +3444,7 @@ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); * * \details Calculates \p half2 reciprocal square root of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. 
* * \returns half2 * \retval The elementwise reciprocal square root on vector \p a. @@ -3460,7 +3460,7 @@ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); * * \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise reciprocal on vector \p a. @@ -3477,7 +3477,7 @@ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); * * \details Calculates \p half2 natural logarithm of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise natural logarithm on vector \p a. @@ -3494,7 +3494,7 @@ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a); * * \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest * mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise binary logarithm on vector \p a. @@ -3511,7 +3511,7 @@ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); * * \details Calculates \p half2 decimal logarithm of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise decimal logarithm on vector \p a. @@ -3528,7 +3528,7 @@ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); * * \details Calculates \p half2 exponential function of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise exponential function on vector \p a. @@ -3545,7 +3545,7 @@ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); * * \details Calculates \p half2 binary exponential function of input vector \p a in * round-to-nearest-even mode. 
-* \param[in] a - half2. Is only being read. +* \param[in] a - half2. Is only being read. * * \returns half2 * \retval The elementwise binary exponential function on vector \p a. @@ -3559,11 +3559,11 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector decimal exponential function in * round-to-nearest-even mode. -* +* * \details Calculates \p half2 decimal exponential function of input vector \p a in * round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise decimal exponential function on vector \p a. * \internal @@ -3575,11 +3575,11 @@ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. -* +* * \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even * mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise cosine on vector \p a. * \internal @@ -3591,10 +3591,10 @@ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); /** * \ingroup CUDA_MATH__HALF2_FUNCTIONS * \brief Calculates \p half2 vector sine in round-to-nearest-even mode. -* +* * \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. -* \param[in] a - half2. Is only being read. -* +* \param[in] a - half2. Is only being read. +* * \returns half2 * \retval The elementwise sine on vector \p a. * \internal diff --git a/numba_cuda/numba/cuda/cuda_fp16.hpp b/numba_cuda/numba/cuda/cuda_fp16.hpp index 19bbd3412..2bc123b58 100644 --- a/numba_cuda/numba/cuda/cuda_fp16.hpp +++ b/numba_cuda/numba/cuda/cuda_fp16.hpp @@ -60,7 +60,7 @@ # define __CPP_VERSION_AT_LEAST_11_FP16 #endif -/* C++11 header for std::move. +/* C++11 header for std::move. 
* In RTC mode, std::move is provided implicitly; don't include the header */ #if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) @@ -145,7 +145,7 @@ * Types which allow static initialization of "half" and "half2" until * these become an actual builtin. Note this initialization is as a * bitfield representation of "half", and not a conversion from short->half. -* Such a representation will be deprecated in a future version of CUDA. +* Such a representation will be deprecated in a future version of CUDA. * (Note these are visible to non-nvcc compilers, including C-only compilation) */ typedef struct __CUDA_ALIGN__(2) { @@ -2443,7 +2443,7 @@ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { #undef __CUDA_HOSTDEVICE_FP16_DECL__ #undef __CUDA_FP16_DECL__ - + /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ #if defined(__cplusplus) && !defined(CUDA_NO_HALF) diff --git a/numba_cuda/numba/cuda/cuda_paths.py b/numba_cuda/numba/cuda/cuda_paths.py index 4290a0a95..7d7f7ce6f 100644 --- a/numba_cuda/numba/cuda/cuda_paths.py +++ b/numba_cuda/numba/cuda/cuda_paths.py @@ -9,7 +9,7 @@ from numba import config -_env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info']) +_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"]) def _find_valid_path(options): @@ -21,16 +21,16 @@ def _find_valid_path(options): if data is not None: return by, data else: - return '', None + return "", None def _get_libdevice_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_libdevice_ctk()), - ('CUDA_HOME', get_cuda_home('nvvm', 'libdevice')), - ('System', get_system_ctk('nvvm', 'libdevice')), - ('Debian package', get_debian_pkg_libdevice()), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA 
package)", get_nvidia_libdevice_ctk()), + ("CUDA_HOME", get_cuda_home("nvvm", "libdevice")), + ("System", get_system_ctk("nvvm", "libdevice")), + ("Debian package", get_debian_pkg_libdevice()), ] by, libdir = _find_valid_path(options) return by, libdir @@ -38,17 +38,17 @@ def _get_libdevice_path_decision(): def _nvvm_lib_dir(): if IS_WIN32: - return 'nvvm', 'bin' + return "nvvm", "bin" else: - return 'nvvm', 'lib64' + return "nvvm", "lib64" def _get_nvvm_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_nvvm_ctk()), - ('CUDA_HOME', get_cuda_home(*_nvvm_lib_dir())), - ('System', get_system_ctk(*_nvvm_lib_dir())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_nvvm_ctk()), + ("CUDA_HOME", get_cuda_home(*_nvvm_lib_dir())), + ("System", get_system_ctk(*_nvvm_lib_dir())), ] by, path = _find_valid_path(options) return by, path @@ -57,7 +57,7 @@ def _get_nvvm_path_decision(): def _get_libdevice_paths(): by, libdir = _get_libdevice_path_decision() # Search for pattern - pat = r'libdevice(\.\d+)*\.bc$' + pat = r"libdevice(\.\d+)*\.bc$" candidates = find_file(re.compile(pat), libdir) # Keep only the max (most recent version) of the bitcode files. 
out = max(candidates, default=None) @@ -66,24 +66,24 @@ def _get_libdevice_paths(): def _cudalib_path(): if IS_WIN32: - return 'bin' + return "bin" else: - return 'lib64' + return "lib64" def _cuda_home_static_cudalib_path(): if IS_WIN32: - return ('lib', 'x64') + return ("lib", "x64") else: - return ('lib64',) + return ("lib64",) def _get_cudalib_dir_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_cudalib_ctk()), - ('CUDA_HOME', get_cuda_home(_cudalib_path())), - ('System', get_system_ctk(_cudalib_path())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), ] by, libdir = _find_valid_path(options) return by, libdir @@ -91,10 +91,10 @@ def _get_cudalib_dir_path_decision(): def _get_static_cudalib_dir_path_decision(): options = [ - ('Conda environment', get_conda_ctk()), - ('Conda environment (NVIDIA package)', get_nvidia_static_cudalib_ctk()), - ('CUDA_HOME', get_cuda_home(*_cuda_home_static_cudalib_path())), - ('System', get_system_ctk(_cudalib_path())), + ("Conda environment", get_conda_ctk()), + ("Conda environment (NVIDIA package)", get_nvidia_static_cudalib_ctk()), + ("CUDA_HOME", get_cuda_home(*_cuda_home_static_cudalib_path())), + ("System", get_system_ctk(_cudalib_path())), ] by, libdir = _find_valid_path(options) return by, libdir @@ -111,25 +111,23 @@ def _get_static_cudalib_dir(): def get_system_ctk(*subdirs): - """Return path to system-wide cudatoolkit; or, None if it doesn't exist. - """ + """Return path to system-wide cudatoolkit; or, None if it doesn't exist.""" # Linux? - if sys.platform.startswith('linux'): + if sys.platform.startswith("linux"): # Is cuda alias to /usr/local/cuda? # We are intentionally not getting versioned cuda installation. 
- base = '/usr/local/cuda' + base = "/usr/local/cuda" if os.path.exists(base): return os.path.join(base, *subdirs) def get_conda_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit. - """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + """Return path to directory containing the shared libraries of cudatoolkit.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return # Assume the existence of NVVM to imply cudatoolkit installed - paths = find_lib('nvvm') + paths = find_lib("nvvm") if not paths: return # Use the directory name of the max path @@ -137,9 +135,8 @@ def get_conda_ctk(): def get_nvidia_nvvm_ctk(): - """Return path to directory containing the NVVM shared library. - """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + """Return path to directory containing the NVVM shared library.""" + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return @@ -147,16 +144,16 @@ def get_nvidia_nvvm_ctk(): # conda package is installed. 
# First, try the location used on Linux and the Windows 11.x packages - libdir = os.path.join(sys.prefix, 'nvvm', _cudalib_path()) + libdir = os.path.join(sys.prefix, "nvvm", _cudalib_path()) if not os.path.exists(libdir) or not os.path.isdir(libdir): # If that fails, try the location used for Windows 12.x packages - libdir = os.path.join(sys.prefix, 'Library', 'nvvm', _cudalib_path()) + libdir = os.path.join(sys.prefix, "Library", "nvvm", _cudalib_path()) if not os.path.exists(libdir) or not os.path.isdir(libdir): # If that doesn't exist either, assume we don't have the NVIDIA # conda package return - paths = find_lib('nvvm', libdir=libdir) + paths = find_lib("nvvm", libdir=libdir) if not paths: return # Use the directory name of the max path @@ -164,39 +161,36 @@ def get_nvidia_nvvm_ctk(): def get_nvidia_libdevice_ctk(): - """Return path to directory containing the libdevice library. - """ + """Return path to directory containing the libdevice library.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return nvvm_dir = os.path.dirname(nvvm_ctk) - return os.path.join(nvvm_dir, 'libdevice') + return os.path.join(nvvm_dir, "libdevice") def get_nvidia_cudalib_ctk(): - """Return path to directory containing the shared libraries of cudatoolkit. - """ + """Return path to directory containing the shared libraries of cudatoolkit.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) - subdir = 'bin' if IS_WIN32 else 'lib' + subdir = "bin" if IS_WIN32 else "lib" return os.path.join(env_dir, subdir) def get_nvidia_static_cudalib_ctk(): - """Return path to directory containing the static libraries of cudatoolkit. 
- """ + """Return path to directory containing the static libraries of cudatoolkit.""" nvvm_ctk = get_nvidia_nvvm_ctk() if not nvvm_ctk: return if IS_WIN32 and ("Library" not in nvvm_ctk): # Location specific to CUDA 11.x packages on Windows - dirs = ('Lib', 'x64') + dirs = ("Lib", "x64") else: # Linux, or Windows with CUDA 12.x packages - dirs = ('lib',) + dirs = ("lib",) env_dir = os.path.dirname(os.path.dirname(nvvm_ctk)) return os.path.join(env_dir, *dirs) @@ -207,17 +201,17 @@ def get_cuda_home(*subdirs): If *subdirs* are the subdirectory name to be appended in the resulting path. """ - cuda_home = os.environ.get('CUDA_HOME') + cuda_home = os.environ.get("CUDA_HOME") if cuda_home is None: # Try Windows CUDA installation without Anaconda - cuda_home = os.environ.get('CUDA_PATH') + cuda_home = os.environ.get("CUDA_PATH") if cuda_home is not None: return os.path.join(cuda_home, *subdirs) def _get_nvvm_path(): by, path = _get_nvvm_path_decision() - candidates = find_lib('nvvm', path) + candidates = find_lib("nvvm", path) path = max(candidates) if candidates else None return _env_path_tuple(by, path) @@ -234,16 +228,16 @@ def get_cuda_paths(): Note: The result of the function is cached. """ # Check cache - if hasattr(get_cuda_paths, '_cached_result'): + if hasattr(get_cuda_paths, "_cached_result"): return get_cuda_paths._cached_result else: # Not in cache d = { - 'nvvm': _get_nvvm_path(), - 'libdevice': _get_libdevice_paths(), - 'cudalib_dir': _get_cudalib_dir(), - 'static_cudalib_dir': _get_static_cudalib_dir(), - 'include_dir': _get_include_dir(), + "nvvm": _get_nvvm_path(), + "libdevice": _get_libdevice_paths(), + "cudalib_dir": _get_cudalib_dir(), + "static_cudalib_dir": _get_static_cudalib_dir(), + "include_dir": _get_include_dir(), } # Cache result get_cuda_paths._cached_result = d @@ -255,7 +249,7 @@ def get_debian_pkg_libdevice(): Return the Debian NVIDIA Maintainers-packaged libdevice location, if it exists. 
""" - pkg_libdevice_location = '/usr/lib/nvidia-cuda-toolkit/libdevice' + pkg_libdevice_location = "/usr/lib/nvidia-cuda-toolkit/libdevice" if not os.path.exists(pkg_libdevice_location): return None return pkg_libdevice_location @@ -274,13 +268,10 @@ def get_current_cuda_target_name(): machine = platform.machine() if system == "Linux": - arch_to_targets = { - 'x86_64': 'x86_64-linux', - 'aarch64': 'sbsa-linux' - } + arch_to_targets = {"x86_64": "x86_64-linux", "aarch64": "sbsa-linux"} elif system == "Windows": arch_to_targets = { - 'AMD64': 'x64', + "AMD64": "x64", } else: arch_to_targets = {} @@ -293,26 +284,28 @@ def get_conda_include_dir(): Return the include directory in the current conda environment, if one is active and it exists. """ - is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) + is_conda_env = os.path.exists(os.path.join(sys.prefix, "conda-meta")) if not is_conda_env: return if platform.system() == "Windows": - include_dir = os.path.join( - sys.prefix, 'Library', 'include' - ) + include_dir = os.path.join(sys.prefix, "Library", "include") elif target_name := get_current_cuda_target_name(): include_dir = os.path.join( - sys.prefix, 'targets', target_name, 'include' + sys.prefix, "targets", target_name, "include" ) else: # A fallback when target cannot determined # though usually it shouldn't. 
- include_dir = os.path.join(sys.prefix, 'include') + include_dir = os.path.join(sys.prefix, "include") - if (os.path.exists(include_dir) and os.path.isdir(include_dir) - and os.path.exists(os.path.join(include_dir, - 'cuda_device_runtime_api.h'))): + if ( + os.path.exists(include_dir) + and os.path.isdir(include_dir) + and os.path.exists( + os.path.join(include_dir, "cuda_device_runtime_api.h") + ) + ): return include_dir return @@ -320,8 +313,8 @@ def get_conda_include_dir(): def _get_include_dir(): """Find the root include directory.""" options = [ - ('Conda environment (NVIDIA package)', get_conda_include_dir()), - ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH), + ("Conda environment (NVIDIA package)", get_conda_include_dir()), + ("CUDA_INCLUDE_PATH Config Entry", config.CUDA_INCLUDE_PATH), # TODO: add others ] by, include_dir = _find_valid_path(options) diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py index de2541e58..547272601 100644 --- a/numba_cuda/numba/cuda/cudadecl.py +++ b/numba_cuda/numba/cuda/cudadecl.py @@ -1,15 +1,23 @@ import operator from numba.core import types -from numba.core.typing.npydecl import (parse_dtype, parse_shape, - register_number_classes, - register_numpy_ufunc, - trigonometric_functions, - comparison_functions, - math_operations, - bit_twiddling_functions) -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - AbstractTemplate, CallableTemplate, - signature, Registry) +from numba.core.typing.npydecl import ( + parse_dtype, + parse_shape, + register_number_classes, + register_numpy_ufunc, + trigonometric_functions, + comparison_functions, + math_operations, + bit_twiddling_functions, +) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + AbstractTemplate, + CallableTemplate, + signature, + Registry, +) from numba.cuda.types import dim3 from numba.core.typeconv import Conversion from numba import cuda @@ -26,15 +34,15 @@ class 
Cuda_array_decl(CallableTemplate): def generic(self): def typer(shape, dtype): - # Only integer literals and tuples of integer literals are valid # shapes if isinstance(shape, types.Integer): if not isinstance(shape, types.IntegerLiteral): return None elif isinstance(shape, (types.Tuple, types.UniTuple)): - if any([not isinstance(s, types.IntegerLiteral) - for s in shape]): + if any( + [not isinstance(s, types.IntegerLiteral) for s in shape] + ): return None else: return None @@ -42,7 +50,7 @@ def typer(shape, dtype): ndim = parse_shape(shape) nb_dtype = parse_dtype(dtype) if nb_dtype is not None and ndim is not None: - return types.Array(dtype=nb_dtype, ndim=ndim, layout='C') + return types.Array(dtype=nb_dtype, ndim=ndim, layout="C") return typer @@ -64,6 +72,7 @@ class Cuda_const_array_like(CallableTemplate): def generic(self): def typer(ndarray): return ndarray + return typer @@ -95,22 +104,49 @@ class Cuda_syncwarp(ConcreteTemplate): class Cuda_shfl_sync_intrinsic(ConcreteTemplate): key = cuda.shfl_sync_intrinsic cases = [ - signature(types.Tuple((types.i4, types.b1)), - types.i4, types.i4, types.i4, types.i4, types.i4), - signature(types.Tuple((types.i8, types.b1)), - types.i4, types.i4, types.i8, types.i4, types.i4), - signature(types.Tuple((types.f4, types.b1)), - types.i4, types.i4, types.f4, types.i4, types.i4), - signature(types.Tuple((types.f8, types.b1)), - types.i4, types.i4, types.f8, types.i4, types.i4), + signature( + types.Tuple((types.i4, types.b1)), + types.i4, + types.i4, + types.i4, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.i8, types.b1)), + types.i4, + types.i4, + types.i8, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.f4, types.b1)), + types.i4, + types.i4, + types.f4, + types.i4, + types.i4, + ), + signature( + types.Tuple((types.f8, types.b1)), + types.i4, + types.i4, + types.f8, + types.i4, + types.i4, + ), ] @register class Cuda_vote_sync_intrinsic(ConcreteTemplate): key = cuda.vote_sync_intrinsic - 
cases = [signature(types.Tuple((types.i4, types.b1)), - types.i4, types.i4, types.b1)] + cases = [ + signature( + types.Tuple((types.i4, types.b1)), types.i4, types.i4, types.b1 + ) + ] @register @@ -153,6 +189,7 @@ class Cuda_popc(ConcreteTemplate): Supported types from `llvm.popc` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.popc cases = [ signature(types.int8, types.int8), @@ -172,6 +209,7 @@ class Cuda_fma(ConcreteTemplate): Supported types from `llvm.fma` [here](https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#standard-c-library-intrinics) """ + key = cuda.fma cases = [ signature(types.float32, types.float32, types.float32, types.float32), @@ -189,7 +227,6 @@ class Cuda_hfma(ConcreteTemplate): @register class Cuda_cbrt(ConcreteTemplate): - key = cuda.cbrt cases = [ signature(types.float32, types.float32), @@ -212,6 +249,7 @@ class Cuda_clz(ConcreteTemplate): Supported types from `llvm.ctlz` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.clz cases = [ signature(types.int8, types.int8), @@ -231,6 +269,7 @@ class Cuda_ffs(ConcreteTemplate): Supported types from `llvm.cttz` [here](http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics) """ + key = cuda.ffs cases = [ signature(types.uint32, types.int8), @@ -254,10 +293,16 @@ def generic(self, args, kws): # per docs # http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp - supported_types = (types.float64, types.float32, - types.int16, types.uint16, - types.int32, types.uint32, - types.int64, types.uint64) + supported_types = ( + types.float64, + types.float32, + types.int16, + types.uint16, + types.int32, + types.uint32, + types.int64, + types.uint64, + ) if a != b or a not in supported_types: return @@ -298,7 +343,6 @@ class Cuda_fp16_binary(ConcreteTemplate): @register_global(float) class Float(AbstractTemplate): - 
def generic(self, args, kws): assert not kws @@ -313,11 +357,11 @@ def _genfp16_binary_comparison(l_key): class Cuda_fp16_cmp(ConcreteTemplate): key = l_key - cases = [ - signature(types.b1, types.float16, types.float16) - ] + cases = [signature(types.b1, types.float16, types.float16)] + return Cuda_fp16_cmp + # If multiple ConcreteTemplates provide typing for a single function, then # function resolution will pick the first compatible typing it finds even if it # involves inserting a cast that would be considered undesirable (in this @@ -340,9 +384,10 @@ class Cuda_fp16_operator(AbstractTemplate): def generic(self, args, kws): assert not kws - if len(args) == 2 and \ - (args[0] == types.float16 or args[1] == types.float16): - if (args[0] == types.float16): + if len(args) == 2 and ( + args[0] == types.float16 or args[1] == types.float16 + ): + if args[0] == types.float16: convertible = self.context.can_convert(args[1], args[0]) else: convertible = self.context.can_convert(args[0], args[1]) @@ -355,9 +400,11 @@ def generic(self, args, kws): # 3. 
fp16 to int8 (safe conversion) - # - Conversion.safe - if (convertible == Conversion.exact) or \ - (convertible == Conversion.promote) or \ - (convertible == Conversion.safe): + if ( + (convertible == Conversion.exact) + or (convertible == Conversion.promote) + or (convertible == Conversion.safe) + ): return signature(retty, types.float16, types.float16) return Cuda_fp16_operator @@ -404,38 +451,42 @@ def _genfp16_binary_operator(op): def _resolve_wrapped_unary(fname): link = tuple() - decl = declare_device_function_template(f'__numba_wrapper_{fname}', - types.float16, - (types.float16,), - link) + decl = declare_device_function_template( + f"__numba_wrapper_{fname}", types.float16, (types.float16,), link + ) return types.Function(decl) def _resolve_wrapped_binary(fname): link = tuple() - decl = declare_device_function_template(f'__numba_wrapper_{fname}', - types.float16, - (types.float16, types.float16,), - link) + decl = declare_device_function_template( + f"__numba_wrapper_{fname}", + types.float16, + ( + types.float16, + types.float16, + ), + link, + ) return types.Function(decl) -hsin_device = _resolve_wrapped_unary('hsin') -hcos_device = _resolve_wrapped_unary('hcos') -hlog_device = _resolve_wrapped_unary('hlog') -hlog10_device = _resolve_wrapped_unary('hlog10') -hlog2_device = _resolve_wrapped_unary('hlog2') -hexp_device = _resolve_wrapped_unary('hexp') -hexp10_device = _resolve_wrapped_unary('hexp10') -hexp2_device = _resolve_wrapped_unary('hexp2') -hsqrt_device = _resolve_wrapped_unary('hsqrt') -hrsqrt_device = _resolve_wrapped_unary('hrsqrt') -hfloor_device = _resolve_wrapped_unary('hfloor') -hceil_device = _resolve_wrapped_unary('hceil') -hrcp_device = _resolve_wrapped_unary('hrcp') -hrint_device = _resolve_wrapped_unary('hrint') -htrunc_device = _resolve_wrapped_unary('htrunc') -hdiv_device = _resolve_wrapped_binary('hdiv') +hsin_device = _resolve_wrapped_unary("hsin") +hcos_device = _resolve_wrapped_unary("hcos") +hlog_device = 
_resolve_wrapped_unary("hlog") +hlog10_device = _resolve_wrapped_unary("hlog10") +hlog2_device = _resolve_wrapped_unary("hlog2") +hexp_device = _resolve_wrapped_unary("hexp") +hexp10_device = _resolve_wrapped_unary("hexp10") +hexp2_device = _resolve_wrapped_unary("hexp2") +hsqrt_device = _resolve_wrapped_unary("hsqrt") +hrsqrt_device = _resolve_wrapped_unary("hrsqrt") +hfloor_device = _resolve_wrapped_unary("hfloor") +hceil_device = _resolve_wrapped_unary("hceil") +hrcp_device = _resolve_wrapped_unary("hrcp") +hrint_device = _resolve_wrapped_unary("hrint") +htrunc_device = _resolve_wrapped_unary("htrunc") +hdiv_device = _resolve_wrapped_binary("hdiv") # generate atomic operations @@ -455,15 +506,20 @@ def generic(self, args, kws): return signature(ary.dtype, ary, types.intp, ary.dtype) elif ary.ndim > 1: return signature(ary.dtype, ary, idx, ary.dtype) + return Cuda_atomic -all_numba_types = (types.float64, types.float32, - types.int32, types.uint32, - types.int64, types.uint64) +all_numba_types = ( + types.float64, + types.float32, + types.int32, + types.uint32, + types.int64, + types.uint64, +) -integer_numba_types = (types.int32, types.uint32, - types.int64, types.uint64) +integer_numba_types = (types.int32, types.uint32, types.int64, types.uint64) unsigned_int_numba_types = (types.uint32, types.uint64) @@ -811,5 +867,5 @@ def resolve_local(self, mod): register_numpy_ufunc(func, register_global) for func in math_operations: - if func in ('log', 'log2', 'log10'): + if func in ("log", "log2", "log10"): register_numpy_ufunc(func, register_global) diff --git a/numba_cuda/numba/cuda/cudadrv/__init__.py b/numba_cuda/numba/cuda/cudadrv/__init__.py index 33bfca345..c7d60a5e3 100644 --- a/numba_cuda/numba/cuda/cudadrv/__init__.py +++ b/numba_cuda/numba/cuda/cudadrv/__init__.py @@ -5,5 +5,7 @@ - Device array implementation """ + from numba.core import config -assert not config.ENABLE_CUDASIM, 'Cannot use real driver API with simulator' + +assert not config.ENABLE_CUDASIM, 
"Cannot use real driver API with simulator" diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 87b00edcf..7ffbca924 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -25,7 +25,7 @@ from warnings import warn try: - lru_cache = getattr(functools, 'lru_cache')(None) + lru_cache = getattr(functools, "lru_cache")(None) except AttributeError: # Python 3.1 or lower def lru_cache(func): @@ -34,7 +34,7 @@ def lru_cache(func): def is_cuda_ndarray(obj): "Check if an object is a CUDA ndarray" - return getattr(obj, '__cuda_ndarray__', False) + return getattr(obj, "__cuda_ndarray__", False) def verify_cuda_ndarray_interface(obj): @@ -45,25 +45,25 @@ def requires_attr(attr, typ): if not hasattr(obj, attr): raise AttributeError(attr) if not isinstance(getattr(obj, attr), typ): - raise AttributeError('%s must be of type %s' % (attr, typ)) + raise AttributeError("%s must be of type %s" % (attr, typ)) - requires_attr('shape', tuple) - requires_attr('strides', tuple) - requires_attr('dtype', np.dtype) - requires_attr('size', int) + requires_attr("shape", tuple) + requires_attr("strides", tuple) + requires_attr("dtype", np.dtype) + requires_attr("size", int) def require_cuda_ndarray(obj): "Raises ValueError is is_cuda_ndarray(obj) evaluates False" if not is_cuda_ndarray(obj): - raise ValueError('require an cuda ndarray object') + raise ValueError("require an cuda ndarray object") class DeviceNDArrayBase(_devicearray.DeviceArray): - """A on GPU NDArray representation - """ + """A on GPU NDArray representation""" + __cuda_memory__ = True - __cuda_ndarray__ = True # There must be gpu_data attribute + __cuda_ndarray__ = True # There must be gpu_data attribute def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): """ @@ -88,9 +88,10 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): dtype = np.dtype(dtype) self.ndim = len(shape) if 
len(strides) != self.ndim: - raise ValueError('strides not match ndim') - self._dummy = dummyarray.Array.from_desc(0, shape, strides, - dtype.itemsize) + raise ValueError("strides not match ndim") + self._dummy = dummyarray.Array.from_desc( + 0, shape, strides, dtype.itemsize + ) self.shape = tuple(shape) self.strides = tuple(strides) self.dtype = dtype @@ -99,7 +100,8 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): if self.size > 0: if gpu_data is None: self.alloc_size = _driver.memory_size_from_info( - self.shape, self.strides, self.dtype.itemsize) + self.shape, self.strides, self.dtype.itemsize + ) gpu_data = devices.get_context().memalloc(self.alloc_size) else: self.alloc_size = _driver.device_memory_size(gpu_data) @@ -109,8 +111,9 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): null = _driver.binding.CUdeviceptr(0) else: null = c_void_p(0) - gpu_data = _driver.MemoryPointer(context=devices.get_context(), - pointer=null, size=0) + gpu_data = _driver.MemoryPointer( + context=devices.get_context(), pointer=null, size=0 + ) self.alloc_size = 0 self.gpu_data = gpu_data @@ -130,12 +133,12 @@ def __cuda_array_interface__(self): ptr = 0 return { - 'shape': tuple(self.shape), - 'strides': None if is_contiguous(self) else tuple(self.strides), - 'data': (ptr, False), - 'typestr': self.dtype.str, - 'stream': int(self.stream) if self.stream != 0 else None, - 'version': 3, + "shape": tuple(self.shape), + "strides": None if is_contiguous(self) else tuple(self.strides), + "data": (ptr, False), + "typestr": self.dtype.str, + "stream": int(self.stream) if self.stream != 0 else None, + "version": 3, } def bind(self, stream=0): @@ -160,6 +163,7 @@ def transpose(self, axes=None): raise ValueError("invalid axes list %r" % (axes,)) else: from numba.cuda.kernels.transpose import transpose + return transpose(self) def _default_stream(self, stream): @@ -186,20 +190,19 @@ def _numba_type_(self): # layouts. 
broadcast = 0 in self.strides - if self.flags['C_CONTIGUOUS'] and not broadcast: - layout = 'C' - elif self.flags['F_CONTIGUOUS'] and not broadcast: - layout = 'F' + if self.flags["C_CONTIGUOUS"] and not broadcast: + layout = "C" + elif self.flags["F_CONTIGUOUS"] and not broadcast: + layout = "F" else: - layout = 'A' + layout = "A" dtype = numpy_support.from_dtype(self.dtype) return types.Array(dtype, self.ndim, layout) @property def device_ctypes_pointer(self): - """Returns the ctypes pointer to the GPU data buffer - """ + """Returns the ctypes pointer to the GPU data buffer""" if self.gpu_data is None: if _driver.USE_NV_BINDING: return _driver.binding.CUdeviceptr(0) @@ -232,13 +235,16 @@ def copy_to_device(self, ary, stream=0): # (i.e., in order to materialize a writable strided view) ary_core = np.array( ary_core, - order='C' if self_core.flags['C_CONTIGUOUS'] else 'F', + order="C" if self_core.flags["C_CONTIGUOUS"] else "F", subok=True, - copy=(not ary_core.flags['WRITEABLE']) - if numpy_version < (2, 0) else None) + copy=(not ary_core.flags["WRITEABLE"]) + if numpy_version < (2, 0) + else None, + ) check_array_compatibility(self_core, ary_core) - _driver.host_to_device(self, ary_core, self.alloc_size, - stream=stream) + _driver.host_to_device( + self, ary_core, self.alloc_size, stream=stream + ) @devices.require_context def copy_to_host(self, ary=None, stream=0): @@ -264,7 +270,7 @@ def copy_to_host(self, ary=None, stream=0): result_array = d_arr.copy_to_host() """ if any(s < 0 for s in self.strides): - msg = 'D->H copy not implemented for negative strides: {}' + msg = "D->H copy not implemented for negative strides: {}" raise NotImplementedError(msg.format(self.strides)) assert self.alloc_size >= 0, "Negative memory size" stream = self._default_stream(stream) @@ -275,16 +281,22 @@ def copy_to_host(self, ary=None, stream=0): hostary = ary if self.alloc_size != 0: - _driver.device_to_host(hostary, self, self.alloc_size, - stream=stream) + 
_driver.device_to_host( + hostary, self, self.alloc_size, stream=stream + ) if ary is None: if self.size == 0: - hostary = np.ndarray(shape=self.shape, dtype=self.dtype, - buffer=hostary) + hostary = np.ndarray( + shape=self.shape, dtype=self.dtype, buffer=hostary + ) else: - hostary = np.ndarray(shape=self.shape, dtype=self.dtype, - strides=self.strides, buffer=hostary) + hostary = np.ndarray( + shape=self.shape, + dtype=self.dtype, + strides=self.strides, + buffer=hostary, + ) return hostary def split(self, section, stream=0): @@ -305,12 +317,16 @@ def split(self, section, stream=0): end = min(begin + section, self.size) shape = (end - begin,) gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize) - yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream, - gpu_data=gpu_data) + yield DeviceNDArray( + shape, + strides, + dtype=self.dtype, + stream=stream, + gpu_data=gpu_data, + ) def as_cuda_arg(self): - """Returns a device memory object that is used as the argument. - """ + """Returns a device memory object that is used as the argument.""" return self.gpu_data def get_ipc_handle(self): @@ -368,8 +384,7 @@ def view(self, dtype): ) shape[-1], rem = divmod( - shape[-1] * self.dtype.itemsize, - dtype.itemsize + shape[-1] * self.dtype.itemsize, dtype.itemsize ) if rem != 0: @@ -398,14 +413,16 @@ def nbytes(self): class DeviceRecord(DeviceNDArrayBase): - ''' + """ An on-GPU record type - ''' + """ + def __init__(self, dtype, stream=0, gpu_data=None): shape = () strides = () - super(DeviceRecord, self).__init__(shape, strides, dtype, stream, - gpu_data) + super(DeviceRecord, self).__init__( + shape, strides, dtype, stream, gpu_data + ) @property def flags(self): @@ -415,7 +432,7 @@ def flags(self): with an existing `numpy.ndarray` (as the C- and F- contiguous flags aren't writeable). 
""" - return dict(self._dummy.flags) # defensive copy + return dict(self._dummy.flags) # defensive copy @property def _numba_type_(self): @@ -431,8 +448,7 @@ def __getitem__(self, item): @devices.require_context def getitem(self, item, stream=0): - """Do `__getitem__(item)` with CUDA stream - """ + """Do `__getitem__(item)` with CUDA stream""" return self._do_getitem(item, stream) def _do_getitem(self, item, stream=0): @@ -442,22 +458,24 @@ def _do_getitem(self, item, stream=0): if typ.shape == (): if typ.names is not None: - return DeviceRecord(dtype=typ, stream=stream, - gpu_data=newdata) + return DeviceRecord(dtype=typ, stream=stream, gpu_data=newdata) else: hostary = np.empty(1, dtype=typ) - _driver.device_to_host(dst=hostary, src=newdata, - size=typ.itemsize, - stream=stream) + _driver.device_to_host( + dst=hostary, src=newdata, size=typ.itemsize, stream=stream + ) return hostary[0] else: - shape, strides, dtype = \ - prepare_shape_strides_dtype(typ.shape, - None, - typ.subdtype[0], 'C') - return DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=newdata, - stream=stream) + shape, strides, dtype = prepare_shape_strides_dtype( + typ.shape, None, typ.subdtype[0], "C" + ) + return DeviceNDArray( + shape=shape, + strides=strides, + dtype=dtype, + gpu_data=newdata, + stream=stream, + ) @devices.require_context def __setitem__(self, key, value): @@ -465,12 +483,10 @@ def __setitem__(self, key, value): @devices.require_context def setitem(self, key, value, stream=0): - """Do `__setitem__(key, value)` with CUDA stream - """ + """Do `__setitem__(key, value)` with CUDA stream""" return self._do_setitem(key, value, stream=stream) def _do_setitem(self, key, value, stream=0): - stream = self._default_stream(stream) # If the record didn't have a default stream, and the user didn't @@ -515,6 +531,7 @@ def _assign_kernel(ndim): @cuda.jit def kernel(lhs, rhs): lhs[()] = rhs[()] + return kernel @cuda.jit @@ -531,9 +548,7 @@ def kernel(lhs, rhs): # [0, :] is the 
to-index (into `lhs`) # [1, :] is the from-index (into `rhs`) - idx = cuda.local.array( - shape=(2, ndim), - dtype=types.int64) + idx = cuda.local.array(shape=(2, ndim), dtype=types.int64) for i in range(ndim - 1, -1, -1): idx[0, i] = location % lhs.shape[i] @@ -541,17 +556,19 @@ def kernel(lhs, rhs): location //= lhs.shape[i] lhs[to_fixed_tuple(idx[0], ndim)] = rhs[to_fixed_tuple(idx[1], ndim)] + return kernel class DeviceNDArray(DeviceNDArrayBase): - ''' + """ An on-GPU array type - ''' + """ + def is_f_contiguous(self): - ''' + """ Return true if the array is Fortran-contiguous. - ''' + """ return self._dummy.is_f_contig @property @@ -562,12 +579,12 @@ def flags(self): with an existing `numpy.ndarray` (as the C- and F- contiguous flags aren't writeable). """ - return dict(self._dummy.flags) # defensive copy + return dict(self._dummy.flags) # defensive copy def is_c_contiguous(self): - ''' + """ Return true if the array is C-contiguous. - ''' + """ return self._dummy.is_c_contig def __array__(self, dtype=None, copy=None): @@ -590,7 +607,7 @@ def reshape(self, *newshape, **kws): Reshape the array without changing its contents, similarly to :meth:`numpy.ndarray.reshape`. 
Example:: - d_arr = d_arr.reshape(20, 50, order='F') + d_arr = d_arr.reshape(20, 50, order="F") """ if len(newshape) == 1 and isinstance(newshape[0], (tuple, list)): newshape = newshape[0] @@ -598,31 +615,43 @@ def reshape(self, *newshape, **kws): cls = type(self) if newshape == self.shape: # nothing to do - return cls(shape=self.shape, strides=self.strides, - dtype=self.dtype, gpu_data=self.gpu_data) + return cls( + shape=self.shape, + strides=self.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + ) newarr, extents = self._dummy.reshape(*newshape, **kws) if extents == [self._dummy.extent]: - return cls(shape=newarr.shape, strides=newarr.strides, - dtype=self.dtype, gpu_data=self.gpu_data) + return cls( + shape=newarr.shape, + strides=newarr.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + ) else: raise NotImplementedError("operation requires copying") - def ravel(self, order='C', stream=0): - ''' + def ravel(self, order="C", stream=0): + """ Flattens a contiguous array without changing its contents, similar to :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an exception. 
- ''' + """ stream = self._default_stream(stream) cls = type(self) newarr, extents = self._dummy.ravel(order=order) if extents == [self._dummy.extent]: - return cls(shape=newarr.shape, strides=newarr.strides, - dtype=self.dtype, gpu_data=self.gpu_data, - stream=stream) + return cls( + shape=newarr.shape, + strides=newarr.strides, + dtype=self.dtype, + gpu_data=self.gpu_data, + stream=stream, + ) else: raise NotImplementedError("operation requires copying") @@ -633,8 +662,7 @@ def __getitem__(self, item): @devices.require_context def getitem(self, item, stream=0): - """Do `__getitem__(item)` with CUDA stream - """ + """Do `__getitem__(item)` with CUDA stream""" return self._do_getitem(item, stream) def _do_getitem(self, item, stream=0): @@ -649,22 +677,36 @@ def _do_getitem(self, item, stream=0): if not arr.is_array: # Check for structured array type (record) if self.dtype.names is not None: - return DeviceRecord(dtype=self.dtype, stream=stream, - gpu_data=newdata) + return DeviceRecord( + dtype=self.dtype, stream=stream, gpu_data=newdata + ) else: # Element indexing hostary = np.empty(1, dtype=self.dtype) - _driver.device_to_host(dst=hostary, src=newdata, - size=self._dummy.itemsize, - stream=stream) + _driver.device_to_host( + dst=hostary, + src=newdata, + size=self._dummy.itemsize, + stream=stream, + ) return hostary[0] else: - return cls(shape=arr.shape, strides=arr.strides, - dtype=self.dtype, gpu_data=newdata, stream=stream) + return cls( + shape=arr.shape, + strides=arr.strides, + dtype=self.dtype, + gpu_data=newdata, + stream=stream, + ) else: newdata = self.gpu_data.view(*arr.extent) - return cls(shape=arr.shape, strides=arr.strides, - dtype=self.dtype, gpu_data=newdata, stream=stream) + return cls( + shape=arr.shape, + strides=arr.strides, + dtype=self.dtype, + gpu_data=newdata, + stream=stream, + ) @devices.require_context def __setitem__(self, key, value): @@ -672,12 +714,10 @@ def __setitem__(self, key, value): @devices.require_context def setitem(self, 
key, value, stream=0): - """Do `__setitem__(key, value)` with CUDA stream - """ + """Do `__setitem__(key, value)` with CUDA stream""" return self._do_setitem(key, value, stream=stream) def _do_setitem(self, key, value, stream=0): - stream = self._default_stream(stream) # If the array didn't have a default stream, and the user didn't provide @@ -706,23 +746,26 @@ def _do_setitem(self, key, value, stream=0): strides=strides, dtype=self.dtype, gpu_data=newdata, - stream=stream) + stream=stream, + ) # (2) prepare RHS rhs, _ = auto_device(value, stream=stream, user_explicit=True) if rhs.ndim > lhs.ndim: - raise ValueError("Can't assign %s-D array to %s-D self" % ( - rhs.ndim, - lhs.ndim)) + raise ValueError( + "Can't assign %s-D array to %s-D self" % (rhs.ndim, lhs.ndim) + ) rhs_shape = np.ones(lhs.ndim, dtype=np.int64) # negative indices would not work if rhs.ndim == 0 - rhs_shape[lhs.ndim - rhs.ndim:] = rhs.shape + rhs_shape[lhs.ndim - rhs.ndim :] = rhs.shape rhs = rhs.reshape(*rhs_shape) for i, (l, r) in enumerate(zip(lhs.shape, rhs.shape)): if r != 1 and l != r: - raise ValueError("Can't copy sequence with size %d to array " - "axis %d with dimension %d" % ( r, i, l)) + raise ValueError( + "Can't copy sequence with size %d to array " + "axis %d with dimension %d" % (r, i, l) + ) # (3) do the copy @@ -751,6 +794,7 @@ class IpcArrayHandle(object): some_code(ipc_array) # ipc_array is dead at this point """ + def __init__(self, ipc_handle, array_desc): self._array_desc = array_desc self._ipc_handle = ipc_handle @@ -798,8 +842,9 @@ def device_setup(self, gpu_data, stream=0): def from_array_like(ary, stream=0, gpu_data=None): "Create a DeviceNDArray object that is like ary." 
- return DeviceNDArray(ary.shape, ary.strides, ary.dtype, stream=stream, - gpu_data=gpu_data) + return DeviceNDArray( + ary.shape, ary.strides, ary.dtype, stream=stream, gpu_data=gpu_data + ) def from_record_like(rec, stream=0, gpu_data=None): @@ -841,15 +886,17 @@ def is_contiguous(ary): return True -errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot " - "be transferred as a single memory region. Please " - "ensure contiguous buffer with numpy " - ".ascontiguousarray()") +errmsg_contiguous_buffer = ( + "Array contains non-contiguous buffer and cannot " + "be transferred as a single memory region. Please " + "ensure contiguous buffer with numpy " + ".ascontiguousarray()" +) def sentry_contiguous(ary): core = array_core(ary) - if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']: + if not core.flags["C_CONTIGUOUS"] and not core.flags["F_CONTIGUOUS"]: raise ValueError(errmsg_contiguous_buffer) @@ -861,7 +908,7 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): """ if _driver.is_device_memory(obj): return obj, False - elif hasattr(obj, '__cuda_array_interface__'): + elif hasattr(obj, "__cuda_array_interface__"): return numba.cuda.as_cuda_array(obj), False else: if isinstance(obj, np.void): @@ -873,9 +920,8 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): # into this function (with no overhead -- copies -- for `obj`s # that are already `ndarray`s. 
obj = np.array( - obj, - copy=False if numpy_version < (2, 0) else None, - subok=True) + obj, copy=False if numpy_version < (2, 0) else None, subok=True + ) sentry_contiguous(obj) devobj = from_array_like(obj, stream=stream) if copy: @@ -883,13 +929,14 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): config.CUDA_WARN_ON_IMPLICIT_COPY and not config.DISABLE_PERFORMANCE_WARNINGS ): - if ( - not user_explicit and - (not isinstance(obj, DeviceNDArray) - and isinstance(obj, np.ndarray)) + if not user_explicit and ( + not isinstance(obj, DeviceNDArray) + and isinstance(obj, np.ndarray) ): - msg = ("Host array used in CUDA kernel will incur " - "copy overhead to/from device.") + msg = ( + "Host array used in CUDA kernel will incur " + "copy overhead to/from device." + ) warn(NumbaPerformanceWarning(msg)) devobj.copy_to_device(obj, stream=stream) return devobj, True @@ -898,13 +945,16 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False): def check_array_compatibility(ary1, ary2): ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze() if ary1.dtype != ary2.dtype: - raise TypeError('incompatible dtype: %s vs. %s' % - (ary1.dtype, ary2.dtype)) + raise TypeError( + "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype) + ) if ary1sq.shape != ary2sq.shape: - raise ValueError('incompatible shape: %s vs. %s' % - (ary1.shape, ary2.shape)) + raise ValueError( + "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape) + ) # We check strides only if the size is nonzero, because strides are # irrelevant (and can differ) for zero-length copies. if ary1.size and ary1sq.strides != ary2sq.strides: - raise ValueError('incompatible strides: %s vs. %s' % - (ary1.strides, ary2.strides)) + raise ValueError( + "incompatible strides: %s vs. 
%s" % (ary1.strides, ary2.strides) + ) diff --git a/numba_cuda/numba/cuda/cudadrv/devices.py b/numba_cuda/numba/cuda/cudadrv/devices.py index 6cc9e2e39..a570f91dd 100644 --- a/numba_cuda/numba/cuda/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/cudadrv/devices.py @@ -10,6 +10,7 @@ - This module must be imported by the main-thread. """ + import functools import threading from contextlib import contextmanager @@ -24,8 +25,10 @@ def __getattr__(self, attr): # Device list is not initialized. # Query all CUDA devices. numdev = driver.get_device_count() - gpus = [_DeviceContextManager(driver.get_device(devid)) - for devid in range(numdev)] + gpus = [ + _DeviceContextManager(driver.get_device(devid)) + for devid in range(numdev) + ] # Define "lst" to avoid re-initialization self.lst = gpus return gpus @@ -34,13 +37,13 @@ def __getattr__(self, attr): return super(_DeviceList, self).__getattr__(attr) def __getitem__(self, devnum): - ''' + """ Returns the context manager for device *devnum*. - ''' + """ return self.lst[devnum] def __str__(self): - return ', '.join([str(d) for d in self.lst]) + return ", ".join([str(d) for d in self.lst]) def __iter__(self): return iter(self.lst) @@ -50,8 +53,7 @@ def __len__(self): @property def current(self): - """Returns the active device or None if there's no active device - """ + """Returns the active device or None if there's no active device""" with driver.get_active_context() as ac: devnum = ac.devnum if devnum is not None: @@ -164,8 +166,10 @@ def _get_or_create_context_uncached(self, devnum): ctx_handle = ctx.handle.value ac_ctx_handle = ac.context_handle.value if ctx_handle != ac_ctx_handle: - msg = ('Numba cannot operate on non-primary' - ' CUDA context {:x}') + msg = ( + "Numba cannot operate on non-primary" + " CUDA context {:x}" + ) raise RuntimeError(msg.format(ac_ctx_handle)) # Ensure the context is ready ctx.prepare_for_use() @@ -178,12 +182,12 @@ def _activate_context_for(self, devnum): # Detect unexpected context switch 
cached_ctx = self._get_attached_context() if cached_ctx is not None and cached_ctx is not newctx: - raise RuntimeError('Cannot switch CUDA-context.') + raise RuntimeError("Cannot switch CUDA-context.") newctx.push() return newctx def _get_attached_context(self): - return getattr(self._tls, 'attached_context', None) + return getattr(self._tls, "attached_context", None) def _set_attached_context(self, ctx): self._tls.attached_context = ctx @@ -226,6 +230,7 @@ def require_context(fn): Note: The function *fn* cannot switch CUDA-context. """ + @functools.wraps(fn) def _require_cuda_context(*args, **kws): with _runtime.ensure_context(): diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index 1641bf779..8db11880b 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -10,6 +10,7 @@ system to freeze in some cases. """ + import sys import os import ctypes @@ -25,8 +26,17 @@ import re from itertools import product from abc import ABCMeta, abstractmethod -from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof, - c_void_p, c_float, c_uint) +from ctypes import ( + c_int, + byref, + c_size_t, + c_char, + c_char_p, + addressof, + c_void_p, + c_float, + c_uint, +) import contextlib import importlib import numpy as np @@ -51,13 +61,14 @@ if USE_NV_BINDING: from cuda import cuda as binding + # There is no definition of the default stream in the Nvidia bindings (nor # is there at the C/C++ level), so we define it here so we don't need to # use a magic number 0 in places where we want the default stream. 
CU_STREAM_DEFAULT = 0 MIN_REQUIRED_CC = (3, 5) -SUPPORTS_IPC = sys.platform.startswith('linux') +SUPPORTS_IPC = sys.platform.startswith("linux") _py_decref = ctypes.pythonapi.Py_DecRef @@ -71,10 +82,9 @@ "to be available" ) -ENABLE_PYNVJITLINK = ( - _readenv("NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False) - or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False) -) +ENABLE_PYNVJITLINK = _readenv( + "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False +) or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False) if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"): config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK @@ -94,7 +104,7 @@ def make_logger(): if config.CUDA_LOG_LEVEL: # create a simple handler that prints to stderr handler = logging.StreamHandler(sys.stderr) - fmt = '== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s' + fmt = "== CUDA [%(relativeCreated)d] %(levelname)5s -- %(message)s" handler.setFormatter(logging.Formatter(fmt=fmt)) logger.addHandler(handler) else: @@ -122,50 +132,52 @@ def __str__(self): def locate_driver_and_loader(): - envpath = config.CUDA_DRIVER - if envpath == '0': + if envpath == "0": # Force fail _raise_driver_not_found() # Determine DLL type - if sys.platform == 'win32': + if sys.platform == "win32": dlloader = ctypes.WinDLL - dldir = ['\\windows\\system32'] - dlnames = ['nvcuda.dll'] - elif sys.platform == 'darwin': + dldir = ["\\windows\\system32"] + dlnames = ["nvcuda.dll"] + elif sys.platform == "darwin": dlloader = ctypes.CDLL - dldir = ['/usr/local/cuda/lib'] - dlnames = ['libcuda.dylib'] + dldir = ["/usr/local/cuda/lib"] + dlnames = ["libcuda.dylib"] else: # Assume to be *nix like dlloader = ctypes.CDLL - dldir = ['/usr/lib', '/usr/lib64'] - dlnames = ['libcuda.so', 'libcuda.so.1'] + dldir = ["/usr/lib", "/usr/lib64"] + dlnames = ["libcuda.so", "libcuda.so.1"] if envpath: try: envpath = os.path.abspath(envpath) except ValueError: - raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid path" % - envpath) + raise ValueError( + 
"NUMBA_CUDA_DRIVER %s is not a valid path" % envpath + ) if not os.path.isfile(envpath): - raise ValueError("NUMBA_CUDA_DRIVER %s is not a valid file " - "path. Note it must be a filepath of the .so/" - ".dll/.dylib or the driver" % envpath) + raise ValueError( + "NUMBA_CUDA_DRIVER %s is not a valid file " + "path. Note it must be a filepath of the .so/" + ".dll/.dylib or the driver" % envpath + ) candidates = [envpath] else: # First search for the name in the default library path. # If that is not found, try the specific path. - candidates = dlnames + [os.path.join(x, y) - for x, y in product(dldir, dlnames)] + candidates = dlnames + [ + os.path.join(x, y) for x, y in product(dldir, dlnames) + ] return dlloader, candidates def load_driver(dlloader, candidates): - # Load the driver; Collect driver error information path_not_exist = [] driver_load_error = [] @@ -184,7 +196,7 @@ def load_driver(dlloader, candidates): if all(path_not_exist): _raise_driver_not_found() else: - errmsg = '\n'.join(str(e) for e in driver_load_error) + errmsg = "\n".join(str(e) for e in driver_load_error) _raise_driver_error(errmsg) @@ -216,7 +228,7 @@ def _raise_driver_error(e): def _build_reverse_error_map(): - prefix = 'CUDA_ERROR' + prefix = "CUDA_ERROR" map = utils.UniqueDict() for name in dir(enums): if name.startswith(prefix): @@ -236,6 +248,7 @@ class Driver(object): """ Driver API functions are lazily bound. """ + _singleton = None def __new__(cls): @@ -254,9 +267,11 @@ def __init__(self): self.pid = None try: if config.DISABLE_CUDA: - msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " - "in the environment, or because CUDA is unsupported on " - "32-bit systems.") + msg = ( + "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " + "in the environment, or because CUDA is unsupported on " + "32-bit systems." 
+ ) raise CudaSupportError(msg) self.lib = find_driver() except CudaSupportError as e: @@ -273,7 +288,7 @@ def ensure_initialized(self): self.is_initialized = True try: - _logger.info('init') + _logger.info("init") self.cuInit(0) except CudaAPIError as e: description = f"{e.msg} ({e.code})" @@ -292,8 +307,9 @@ def __getattr__(self, fname): self.ensure_initialized() if self.initialization_error is not None: - raise CudaSupportError("Error at driver init: \n%s:" % - self.initialization_error) + raise CudaSupportError( + "Error at driver init: \n%s:" % self.initialization_error + ) if USE_NV_BINDING: return self._cuda_python_wrap_fn(fname) @@ -317,12 +333,12 @@ def _ctypes_wrap_fn(self, fname, libfn=None): def verbose_cuda_api_call(*args): argstr = ", ".join([str(arg) for arg in args]) - _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr) + _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr) retcode = libfn(*args) self._check_ctypes_error(fname, retcode) def safe_cuda_api_call(*args): - _logger.debug('call driver api: %s', libfn.__name__) + _logger.debug("call driver api: %s", libfn.__name__) retcode = libfn(*args) self._check_ctypes_error(fname, retcode) @@ -340,11 +356,11 @@ def _cuda_python_wrap_fn(self, fname): def verbose_cuda_api_call(*args): argstr = ", ".join([str(arg) for arg in args]) - _logger.debug('call driver api: %s(%s)', libfn.__name__, argstr) + _logger.debug("call driver api: %s(%s)", libfn.__name__, argstr) return self._check_cuda_python_error(fname, libfn(*args)) def safe_cuda_api_call(*args): - _logger.debug('call driver api: %s', libfn.__name__) + _logger.debug("call driver api: %s", libfn.__name__) return self._check_cuda_python_error(fname, libfn(*args)) if config.CUDA_LOG_API_ARGS: @@ -361,27 +377,27 @@ def _find_api(self, fname): # binding. For the NVidia binding, it handles linking to the correct # variant. 
if config.CUDA_PER_THREAD_DEFAULT_STREAM and not USE_NV_BINDING: - variants = ('_v2_ptds', '_v2_ptsz', '_ptds', '_ptsz', '_v2', '') + variants = ("_v2_ptds", "_v2_ptsz", "_ptds", "_ptsz", "_v2", "") else: - variants = ('_v2', '') + variants = ("_v2", "") for variant in variants: try: - return getattr(self.lib, f'{fname}{variant}') + return getattr(self.lib, f"{fname}{variant}") except AttributeError: pass # Not found. # Delay missing function error to use def absent_function(*args, **kws): - raise CudaDriverError(f'Driver missing function: {fname}') + raise CudaDriverError(f"Driver missing function: {fname}") setattr(self, fname, absent_function) return absent_function def _detect_fork(self): if self.pid is not None and _getpid() != self.pid: - msg = 'pid %s forked from pid %s after CUDA driver init' + msg = "pid %s forked from pid %s after CUDA driver init" _logger.critical(msg, _getpid(), self.pid) raise CudaDriverError("CUDA initialized before forking") @@ -425,13 +441,11 @@ def get_device_count(self): return count.value def list_devices(self): - """Returns a list of active devices - """ + """Returns a list of active devices""" return list(self.devices.values()) def reset(self): - """Reset all devices - """ + """Reset all devices""" for dev in self.devices.values(): dev.reset() @@ -449,8 +463,7 @@ def pop_active_context(self): return popped def get_active_context(self): - """Returns an instance of ``_ActiveContext``. - """ + """Returns an instance of ``_ActiveContext``.""" return _ActiveContext() def get_version(self): @@ -477,12 +490,13 @@ class _ActiveContext(object): Once entering the context, it is assumed that the active CUDA context is not changed until the context is exited. """ + _tls_cache = threading.local() def __enter__(self): is_top = False # check TLS cache - if hasattr(self._tls_cache, 'ctx_devnum'): + if hasattr(self._tls_cache, "ctx_devnum"): hctx, devnum = self._tls_cache.ctx_devnum # Not cached. Query the driver API. 
else: @@ -515,11 +529,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): if self._is_top: - delattr(self._tls_cache, 'ctx_devnum') + delattr(self._tls_cache, "ctx_devnum") def __bool__(self): - """Returns True is there's a valid and active CUDA context. - """ + """Returns True is there's a valid and active CUDA context.""" return self.context_handle is not None __nonzero__ = __bool__ @@ -533,7 +546,7 @@ def _build_reverse_device_attrs(): map = utils.UniqueDict() for name in dir(enums): if name.startswith(prefix): - map[name[len(prefix):]] = getattr(enums, name) + map[name[len(prefix) :]] = getattr(enums, name) return map @@ -545,6 +558,7 @@ class Device(object): The device object owns the CUDA contexts. This is owned by the driver object. User should not construct devices directly. """ + @classmethod def from_identity(self, identity): """Create Device object from device identity created by @@ -579,15 +593,17 @@ def __init__(self, devnum): self.attributes = {} # Read compute capability - self.compute_capability = (self.COMPUTE_CAPABILITY_MAJOR, - self.COMPUTE_CAPABILITY_MINOR) + self.compute_capability = ( + self.COMPUTE_CAPABILITY_MAJOR, + self.COMPUTE_CAPABILITY_MINOR, + ) # Read name bufsz = 128 if USE_NV_BINDING: buf = driver.cuDeviceGetName(bufsz, self.id) - name = buf.decode('utf-8').rstrip('\0') + name = buf.decode("utf-8").rstrip("\0") else: buf = (c_char * bufsz)() driver.cuDeviceGetName(buf, bufsz, self.id) @@ -604,31 +620,31 @@ def __init__(self, devnum): driver.cuDeviceGetUuid(byref(uuid), self.id) uuid_vals = tuple(bytes(uuid)) - b = '%02x' + b = "%02x" b2 = b * 2 b4 = b * 4 b6 = b * 6 - fmt = f'GPU-{b4}-{b2}-{b2}-{b2}-{b6}' + fmt = f"GPU-{b4}-{b2}-{b2}-{b2}-{b6}" self.uuid = fmt % uuid_vals self.primary_context = None def get_device_identity(self): return { - 'pci_domain_id': self.PCI_DOMAIN_ID, - 'pci_bus_id': self.PCI_BUS_ID, - 'pci_device_id': self.PCI_DEVICE_ID, + "pci_domain_id": self.PCI_DOMAIN_ID, + "pci_bus_id": 
self.PCI_BUS_ID, + "pci_device_id": self.PCI_DEVICE_ID, } def __repr__(self): return "" % (self.id, self.name) def __getattr__(self, attr): - """Read attributes lazily - """ + """Read attributes lazily""" if USE_NV_BINDING: - code = getattr(binding.CUdevice_attribute, - f'CU_DEVICE_ATTRIBUTE_{attr}') + code = getattr( + binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}" + ) value = driver.cuDeviceGetAttribute(code, self.id) else: try: @@ -698,17 +714,18 @@ def supports_float16(self): def met_requirement_for_device(device): if device.compute_capability < MIN_REQUIRED_CC: - raise CudaSupportError("%s has compute capability < %s" % - (device, MIN_REQUIRED_CC)) + raise CudaSupportError( + "%s has compute capability < %s" % (device, MIN_REQUIRED_CC) + ) class BaseCUDAMemoryManager(object, metaclass=ABCMeta): """Abstract base class for External Memory Management (EMM) Plugins.""" def __init__(self, *args, **kwargs): - if 'context' not in kwargs: + if "context" not in kwargs: raise RuntimeError("Memory manager requires a context") - self.context = kwargs.pop('context') + self.context = kwargs.pop("context") @abstractmethod def memalloc(self, size): @@ -864,8 +881,7 @@ def _attempt_allocation(self, allocator): else: raise - def memhostalloc(self, size, mapped=False, portable=False, - wc=False): + def memhostalloc(self, size, mapped=False, portable=False, wc=False): """Implements the allocation of pinned host memory. 
It is recommended that this method is not overridden by EMM Plugin @@ -880,6 +896,7 @@ def memhostalloc(self, size, mapped=False, portable=False, flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED if USE_NV_BINDING: + def allocator(): return driver.cuMemHostAlloc(size, flags) @@ -946,16 +963,19 @@ def allocator(): ctx = weakref.proxy(self.context) if mapped: - mem = MappedMemory(ctx, pointer, size, owner=owner, - finalizer=finalizer) + mem = MappedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) self.allocations[alloc_key] = mem return mem.own() else: - return PinnedMemory(ctx, pointer, size, owner=owner, - finalizer=finalizer) + return PinnedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) def memallocmanaged(self, size, attach_global): if USE_NV_BINDING: + def allocator(): ma_flags = binding.CUmemAttach_flags @@ -1014,8 +1034,7 @@ def defer_cleanup(self): class GetIpcHandleMixin: - """A class that provides a default implementation of ``get_ipc_handle()``. 
- """ + """A class that provides a default implementation of ``get_ipc_handle()``.""" def get_ipc_handle(self, memory): """Open an IPC memory handle by using ``cuMemGetAddressRange`` to @@ -1034,8 +1053,9 @@ def get_ipc_handle(self, memory): offset = memory.handle.value - base source_info = self.context.device.get_device_identity() - return IpcHandle(memory, ipchandle, memory.size, source_info, - offset=offset) + return IpcHandle( + memory, ipchandle, memory.size, source_info, offset=offset + ) class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager): @@ -1050,6 +1070,7 @@ def initialize(self): def memalloc(self, size): if USE_NV_BINDING: + def allocator(): return driver.cuMemAlloc(size) @@ -1098,7 +1119,7 @@ def _ensure_memory_manager(): if _memory_manager: return - if config.CUDA_MEMORY_MANAGER == 'default': + if config.CUDA_MEMORY_MANAGER == "default": _memory_manager = NumbaCUDAMemoryManager return @@ -1106,8 +1127,9 @@ def _ensure_memory_manager(): mgr_module = importlib.import_module(config.CUDA_MEMORY_MANAGER) set_memory_manager(mgr_module._numba_memory_manager) except Exception: - raise RuntimeError("Failed to use memory manager from %s" % - config.CUDA_MEMORY_MANAGER) + raise RuntimeError( + "Failed to use memory manager from %s" % config.CUDA_MEMORY_MANAGER + ) def set_memory_manager(mm_plugin): @@ -1124,8 +1146,10 @@ def set_memory_manager(mm_plugin): dummy = mm_plugin(context=None) iv = dummy.interface_version if iv != _SUPPORTED_EMM_INTERFACE_VERSION: - err = "EMM Plugin interface has version %d - version %d required" \ - % (iv, _SUPPORTED_EMM_INTERFACE_VERSION) + err = "EMM Plugin interface has version %d - version %d required" % ( + iv, + _SUPPORTED_EMM_INTERFACE_VERSION, + ) raise RuntimeError(err) _memory_manager = mm_plugin @@ -1140,7 +1164,7 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls, 0) def __str__(self): - return '?' + return "?" 
_SizeNotSet = _SizeNotSet() @@ -1153,6 +1177,7 @@ class _PendingDeallocs(object): modified later once the driver is initialized and the total memory capacity known. """ + def __init__(self, capacity=_SizeNotSet): self._cons = deque() self._disable_count = 0 @@ -1172,11 +1197,13 @@ def add_item(self, dtor, handle, size=_SizeNotSet): byte size of the resource added. It is an optional argument. Some resources (e.g. CUModule) has an unknown memory footprint on the device. """ - _logger.info('add pending dealloc: %s %s bytes', dtor.__name__, size) + _logger.info("add pending dealloc: %s %s bytes", dtor.__name__, size) self._cons.append((dtor, handle, size)) self._size += int(size) - if (len(self._cons) > config.CUDA_DEALLOCS_COUNT or - self._size > self._max_pending_bytes): + if ( + len(self._cons) > config.CUDA_DEALLOCS_COUNT + or self._size > self._max_pending_bytes + ): self.clear() def clear(self): @@ -1187,7 +1214,7 @@ def clear(self): if not self.is_disabled: while self._cons: [dtor, handle, size] = self._cons.popleft() - _logger.info('dealloc: %s %s bytes', dtor.__name__, size) + _logger.info("dealloc: %s %s bytes", dtor.__name__, size) dtor(handle) self._size = 0 @@ -1251,19 +1278,19 @@ def reset(self): Clean up all owned resources in this context. """ # Free owned resources - _logger.info('reset context of device %s', self.device.id) + _logger.info("reset context of device %s", self.device.id) self.memory_manager.reset() self.modules.clear() # Clear trash self.deallocations.clear() def get_memory_info(self): - """Returns (free, total) memory in bytes in the context. - """ + """Returns (free, total) memory in bytes in the context.""" return self.memory_manager.get_memory_info() - def get_active_blocks_per_multiprocessor(self, func, blocksize, memsize, - flags=None): + def get_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags=None + ): """Return occupancy of a function. 
:param func: kernel for which occupancy is calculated :param blocksize: block size the kernel is intended to be launched with @@ -1275,8 +1302,9 @@ def get_active_blocks_per_multiprocessor(self, func, blocksize, memsize, else: return self._ctypes_active_blocks_per_multiprocessor(*args) - def _cuda_python_active_blocks_per_multiprocessor(self, func, blocksize, - memsize, flags): + def _cuda_python_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags + ): ps = [func.handle, blocksize, memsize] if not flags: @@ -1285,8 +1313,9 @@ def _cuda_python_active_blocks_per_multiprocessor(self, func, blocksize, ps.append(flags) return driver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(*ps) - def _ctypes_active_blocks_per_multiprocessor(self, func, blocksize, - memsize, flags): + def _ctypes_active_blocks_per_multiprocessor( + self, func, blocksize, memsize, flags + ): retval = c_int() args = (byref(retval), func.handle, blocksize, memsize) @@ -1297,8 +1326,9 @@ def _ctypes_active_blocks_per_multiprocessor(self, func, blocksize, return retval.value - def get_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags=None): + def get_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags=None + ): """Suggest a launch configuration with reasonable occupancy. 
:param func: kernel for which occupancy is calculated :param b2d_func: function that calculates how much per-block dynamic @@ -1315,13 +1345,20 @@ def get_max_potential_block_size(self, func, b2d_func, memsize, else: return self._ctypes_max_potential_block_size(*args) - def _ctypes_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags): + def _ctypes_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags + ): gridsize = c_int() blocksize = c_int() b2d_cb = cu_occupancy_b2d_size(b2d_func) - args = [byref(gridsize), byref(blocksize), func.handle, b2d_cb, - memsize, blocksizelimit] + args = [ + byref(gridsize), + byref(blocksize), + func.handle, + b2d_cb, + memsize, + blocksizelimit, + ] if not flags: driver.cuOccupancyMaxPotentialBlockSize(*args) @@ -1331,10 +1368,11 @@ def _ctypes_max_potential_block_size(self, func, b2d_func, memsize, return (gridsize.value, blocksize.value) - def _cuda_python_max_potential_block_size(self, func, b2d_func, memsize, - blocksizelimit, flags): + def _cuda_python_max_potential_block_size( + self, func, b2d_func, memsize, blocksizelimit, flags + ): b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func) - ptr = int.from_bytes(b2d_cb, byteorder='little') + ptr = int.from_bytes(b2d_cb, byteorder="little") driver_b2d_cb = binding.CUoccupancyB2DSize(ptr) args = [func.handle, driver_b2d_cb, memsize, blocksizelimit] @@ -1387,7 +1425,7 @@ def get_ipc_handle(self, memory): Returns an *IpcHandle* from a GPU allocation. 
""" if not SUPPORTS_IPC: - raise OSError('OS does not support CUDA IPC') + raise OSError("OS does not support CUDA IPC") return self.memory_manager.get_ipc_handle(memory) def open_ipc_handle(self, handle, size): @@ -1400,13 +1438,13 @@ def open_ipc_handle(self, handle, size): driver.cuIpcOpenMemHandle(byref(dptr), handle, flags) # wrap it - return MemoryPointer(context=weakref.proxy(self), pointer=dptr, - size=size) + return MemoryPointer( + context=weakref.proxy(self), pointer=dptr, size=size + ) def enable_peer_access(self, peer_context, flags=0): - """Enable peer access between the current context and the peer context - """ - assert flags == 0, '*flags* is reserved and MUST be zero' + """Enable peer access between the current context and the peer context""" + assert flags == 0, "*flags* is reserved and MUST be zero" driver.cuCtxEnablePeerAccess(peer_context, flags) def can_access_peer(self, peer_device): @@ -1415,18 +1453,22 @@ def can_access_peer(self, peer_device): """ if USE_NV_BINDING: peer_device = binding.CUdevice(peer_device) - can_access_peer = driver.cuDeviceCanAccessPeer(self.device.id, - peer_device) + can_access_peer = driver.cuDeviceCanAccessPeer( + self.device.id, peer_device + ) else: can_access_peer = c_int() - driver.cuDeviceCanAccessPeer(byref(can_access_peer), - self.device.id, peer_device,) + driver.cuDeviceCanAccessPeer( + byref(can_access_peer), + self.device.id, + peer_device, + ) return bool(can_access_peer) def create_module_ptx(self, ptx): if isinstance(ptx, str): - ptx = ptx.encode('utf8') + ptx = ptx.encode("utf8") if USE_NV_BINDING: image = ptx else: @@ -1481,8 +1523,11 @@ def create_stream(self): else: handle = drvapi.cu_stream() driver.cuStreamCreate(byref(handle), 0) - return Stream(weakref.proxy(self), handle, - _stream_finalizer(self.deallocations, handle)) + return Stream( + weakref.proxy(self), + handle, + _stream_finalizer(self.deallocations, handle), + ) def create_external_stream(self, ptr): if not isinstance(ptr, int): @@ 
-1491,8 +1536,7 @@ def create_external_stream(self, ptr): handle = binding.CUstream(ptr) else: handle = drvapi.cu_stream(ptr) - return Stream(weakref.proxy(self), handle, None, - external=True) + return Stream(weakref.proxy(self), handle, None, external=True) def create_event(self, timing=True): flags = 0 @@ -1503,8 +1547,11 @@ def create_event(self, timing=True): else: handle = drvapi.cu_event() driver.cuEventCreate(byref(handle), flags) - return Event(weakref.proxy(self), handle, - finalizer=_event_finalizer(self.deallocations, handle)) + return Event( + weakref.proxy(self), + handle, + finalizer=_event_finalizer(self.deallocations, handle), + ) def synchronize(self): driver.cuCtxSynchronize() @@ -1557,16 +1604,21 @@ def load_module_image_ctypes(context, image): handle = drvapi.cu_module() try: - driver.cuModuleLoadDataEx(byref(handle), image, len(options), - option_keys, option_vals) + driver.cuModuleLoadDataEx( + byref(handle), image, len(options), option_keys, option_vals + ) except CudaAPIError as e: msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8") raise CudaAPIError(e.code, msg) info_log = jitinfo.value - return CtypesModule(weakref.proxy(context), handle, info_log, - _module_finalizer(context, handle)) + return CtypesModule( + weakref.proxy(context), + handle, + info_log, + _module_finalizer(context, handle), + ) def load_module_image_cuda_python(context, image): @@ -1591,17 +1643,22 @@ def load_module_image_cuda_python(context, image): option_vals = [v for v in options.values()] try: - handle = driver.cuModuleLoadDataEx(image, len(options), option_keys, - option_vals) + handle = driver.cuModuleLoadDataEx( + image, len(options), option_keys, option_vals + ) except CudaAPIError as e: - err_string = jiterrors.decode('utf-8') + err_string = jiterrors.decode("utf-8") msg = "cuModuleLoadDataEx error:\n%s" % err_string raise CudaAPIError(e.code, msg) - info_log = jitinfo.decode('utf-8') + info_log = jitinfo.decode("utf-8") - return 
CudaPythonModule(weakref.proxy(context), handle, info_log, - _module_finalizer(context, handle)) + return CudaPythonModule( + weakref.proxy(context), + handle, + info_log, + _module_finalizer(context, handle), + ) def _alloc_finalizer(memory_manager, ptr, alloc_key, size): @@ -1704,6 +1761,7 @@ class _CudaIpcImpl(object): """Implementation of GPU IPC using CUDA driver API. This requires the devices to be peer accessible. """ + def __init__(self, parent): self.base = parent.base self.handle = parent.handle @@ -1717,10 +1775,10 @@ def open(self, context): Import the IPC memory and returns a raw CUDA memory pointer object """ if self.base is not None: - raise ValueError('opening IpcHandle from original process') + raise ValueError("opening IpcHandle from original process") if self._opened_mem is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") mem = context.open_ipc_handle(self.handle, self.offset + self.size) # this object owns the opened allocation @@ -1731,7 +1789,7 @@ def open(self, context): def close(self): if self._opened_mem is None: - raise ValueError('IpcHandle not opened') + raise ValueError("IpcHandle not opened") driver.cuIpcCloseMemHandle(self._opened_mem.handle) self._opened_mem = None @@ -1740,6 +1798,7 @@ class _StagedIpcImpl(object): """Implementation of GPU IPC using custom staging logic to workaround CUDA IPC limitation on peer accessibility between devices. """ + def __init__(self, parent, source_info): self.parent = parent self.base = parent.base @@ -1795,6 +1854,7 @@ class IpcHandle(object): referred to by this IPC handle. :type offset: int """ + def __init__(self, base, handle, size, source_info=None, offset=0): self.base = base self.handle = handle @@ -1818,12 +1878,11 @@ def can_access_peer(self, context): return context.can_access_peer(source_device.id) def open_staged(self, context): - """Open the IPC by allowing staging on the host memory first. 
- """ + """Open the IPC by allowing staging on the host memory first.""" self._sentry_source_info() if self._impl is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") self._impl = _StagedIpcImpl(self, self.source_info) return self._impl.open(context) @@ -1833,7 +1892,7 @@ def open_direct(self, context): Import the IPC memory and returns a raw CUDA memory pointer object """ if self._impl is not None: - raise ValueError('IpcHandle is already opened') + raise ValueError("IpcHandle is already opened") self._impl = _CudaIpcImpl(self) return self._impl.open(context) @@ -1864,12 +1923,13 @@ def open_array(self, context, shape, dtype, strides=None): strides = dtype.itemsize dptr = self.open(context) # read the device pointer as an array - return devicearray.DeviceNDArray(shape=shape, strides=strides, - dtype=dtype, gpu_data=dptr) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=dtype, gpu_data=dptr + ) def close(self): if self._impl is None: - raise ValueError('IpcHandle not opened') + raise ValueError("IpcHandle not opened") self._impl.close() self._impl = None @@ -1895,8 +1955,13 @@ def _rebuild(cls, handle_ary, size, source_info, offset): else: handle = drvapi.cu_ipc_mem_handle() handle.reserved = handle_ary - return cls(base=None, handle=handle, size=size, - source_info=source_info, offset=offset) + return cls( + base=None, + handle=handle, + size=size, + source_info=source_info, + offset=offset, + ) class MemoryPointer(object): @@ -1930,6 +1995,7 @@ class MemoryPointer(object): :param finalizer: A function that is called when the buffer is to be freed. 
:type finalizer: function """ + __cuda_memory__ = True def __init__(self, context, pointer, size, owner=None, finalizer=None): @@ -1965,8 +2031,9 @@ def free(self): def memset(self, byte, count=None, stream=0): count = self.size if count is None else count if stream: - driver.cuMemsetD8Async(self.device_pointer, byte, count, - stream.handle) + driver.cuMemsetD8Async( + self.device_pointer, byte, count, stream.handle + ) else: driver.cuMemsetD8(self.device_pointer, byte, count) @@ -1980,12 +2047,12 @@ def view(self, start, stop=None): if not self.device_pointer_value: if size != 0: raise RuntimeError("non-empty slice into empty slice") - view = self # new view is just a reference to self + view = self # new view is just a reference to self # Handle normal case else: base = self.device_pointer_value + start if size < 0: - raise RuntimeError('size cannot be negative') + raise RuntimeError("size cannot be negative") if USE_NV_BINDING: pointer = binding.CUdeviceptr() ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr()) @@ -2021,6 +2088,7 @@ class AutoFreePointer(MemoryPointer): Constructor arguments are the same as for :class:`MemoryPointer`. 
""" + def __init__(self, *args, **kwargs): super(AutoFreePointer, self).__init__(*args, **kwargs) # Releease the self reference to the buffer, so that the finalizer @@ -2063,8 +2131,9 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None): self._bufptr_ = self.host_pointer.value self.device_pointer = devptr - super(MappedMemory, self).__init__(context, devptr, size, - finalizer=finalizer) + super(MappedMemory, self).__init__( + context, devptr, size, finalizer=finalizer + ) self.handle = self.host_pointer # For buffer interface @@ -2179,8 +2248,7 @@ def deref(): weakref.finalize(self, deref) def __getattr__(self, fname): - """Proxy MemoryPointer methods - """ + """Proxy MemoryPointer methods""" return getattr(self._view, fname) @@ -2211,18 +2279,15 @@ def __repr__(self): if USE_NV_BINDING: default_streams = { CU_STREAM_DEFAULT: "", - binding.CU_STREAM_LEGACY: - "", - binding.CU_STREAM_PER_THREAD: - "", + binding.CU_STREAM_LEGACY: "", + binding.CU_STREAM_PER_THREAD: "", } ptr = int(self.handle) or 0 else: default_streams = { drvapi.CU_STREAM_DEFAULT: "", drvapi.CU_STREAM_LEGACY: "", - drvapi.CU_STREAM_PER_THREAD: - "", + drvapi.CU_STREAM_PER_THREAD: "", } ptr = self.handle.value or drvapi.CU_STREAM_DEFAULT @@ -2234,18 +2299,18 @@ def __repr__(self): return "" % (ptr, self.context) def synchronize(self): - ''' + """ Wait for all commands in this stream to execute. This will commit any pending memory transfers. - ''' + """ driver.cuStreamSynchronize(self.handle) @contextlib.contextmanager def auto_synchronize(self): - ''' + """ A context manager that waits for all commands in this stream to execute and commits any pending memory transfers upon exiting the context. 
- ''' + """ yield self self.synchronize() @@ -2272,7 +2337,7 @@ def add_callback(self, callback, arg=None): data = (self, callback, arg) _py_incref(data) if USE_NV_BINDING: - ptr = int.from_bytes(self._stream_callback, byteorder='little') + ptr = int.from_bytes(self._stream_callback, byteorder="little") stream_callback = binding.CUstreamCallback(ptr) # The callback needs to receive a pointer to the data PyObject data = id(data) @@ -2373,9 +2438,9 @@ def elapsed_time(self, evtend): def event_elapsed_time(evtstart, evtend): - ''' + """ Compute the elapsed time between two events in milliseconds. - ''' + """ if USE_NV_BINDING: return driver.cuEventElapsedTime(evtstart.handle, evtend.handle) else: @@ -2408,34 +2473,35 @@ def get_global_symbol(self, name): class CtypesModule(Module): - def get_function(self, name): handle = drvapi.cu_function() - driver.cuModuleGetFunction(byref(handle), self.handle, - name.encode('utf8')) + driver.cuModuleGetFunction( + byref(handle), self.handle, name.encode("utf8") + ) return CtypesFunction(weakref.proxy(self), handle, name) def get_global_symbol(self, name): ptr = drvapi.cu_device_ptr() size = drvapi.c_size_t() - driver.cuModuleGetGlobal(byref(ptr), byref(size), self.handle, - name.encode('utf8')) + driver.cuModuleGetGlobal( + byref(ptr), byref(size), self.handle, name.encode("utf8") + ) return MemoryPointer(self.context, ptr, size), size.value class CudaPythonModule(Module): - def get_function(self, name): - handle = driver.cuModuleGetFunction(self.handle, name.encode('utf8')) + handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8")) return CudaPythonFunction(weakref.proxy(self), handle, name) def get_global_symbol(self, name): - ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode('utf8')) + ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8")) return MemoryPointer(self.context, ptr, size), size -FuncAttr = namedtuple("FuncAttr", ["regs", "shared", "local", "const", - "maxthreads"]) 
+FuncAttr = namedtuple( + "FuncAttr", ["regs", "shared", "local", "const", "maxthreads"] +) class Function(metaclass=ABCMeta): @@ -2458,8 +2524,9 @@ def device(self): return self.module.context.device @abstractmethod - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): """Set the cache configuration for this function.""" @abstractmethod @@ -2473,9 +2540,9 @@ def read_func_attr_all(self): class CtypesFunction(Function): - - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): prefer_equal = prefer_equal or (prefer_cache and prefer_shared) if prefer_equal: flag = enums.CU_FUNC_CACHE_PREFER_EQUAL @@ -2498,15 +2565,17 @@ def read_func_attr_all(self): lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) maxtpb = self.read_func_attr( - enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) - return FuncAttr(regs=nregs, const=cmem, local=lmem, shared=smem, - maxthreads=maxtpb) + enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + ) + return FuncAttr( + regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb + ) class CudaPythonFunction(Function): - - def cache_config(self, prefer_equal=False, prefer_cache=False, - prefer_shared=False): + def cache_config( + self, prefer_equal=False, prefer_cache=False, prefer_shared=False + ): prefer_equal = prefer_equal or (prefer_cache and prefer_shared) attr = binding.CUfunction_attribute if prefer_equal: @@ -2529,19 +2598,26 @@ def read_func_attr_all(self): lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) maxtpb = self.read_func_attr( - attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) - return FuncAttr(regs=nregs, 
const=cmem, local=lmem, shared=smem, - maxthreads=maxtpb) - + attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + ) + return FuncAttr( + regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb + ) -def launch_kernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - args, - cooperative=False): +def launch_kernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + args, + cooperative=False, +): param_ptrs = [addressof(arg) for arg in args] params = (c_void_p * len(param_ptrs))(*param_ptrs) @@ -2553,46 +2629,54 @@ def launch_kernel(cufunc_handle, extra = None if cooperative: - driver.cuLaunchCooperativeKernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - params_for_launch) + driver.cuLaunchCooperativeKernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + params_for_launch, + ) else: - driver.cuLaunchKernel(cufunc_handle, - gx, gy, gz, - bx, by, bz, - sharedmem, - hstream, - params_for_launch, - extra) + driver.cuLaunchKernel( + cufunc_handle, + gx, + gy, + gz, + bx, + by, + bz, + sharedmem, + hstream, + params_for_launch, + extra, + ) class Linker(metaclass=ABCMeta): """Abstract base class for linkers""" @classmethod - def new(cls, - max_registers=0, - lineinfo=False, - cc=None, - lto=None, - additional_flags=None - ): - + def new( + cls, + max_registers=0, + lineinfo=False, + cc=None, + lto=None, + additional_flags=None, + ): driver_ver = driver.get_version() - if ( - config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY - and driver_ver >= (12, 0) + if config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY and driver_ver >= ( + 12, + 0, ): - raise ValueError( - "Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC" - ) + raise ValueError("Use CUDA_ENABLE_PYNVJITLINK for CUDA >= 12.0 MVC") if config.CUDA_ENABLE_PYNVJITLINK and driver_ver < (12, 0): - raise ValueError( - "Enabling pynvjitlink requires CUDA 12." 
- ) + raise ValueError("Enabling pynvjitlink requires CUDA 12.") if config.CUDA_ENABLE_PYNVJITLINK: linker = PyNvJitLinker @@ -2641,9 +2725,9 @@ def add_cu(self, cu, name): ptx, log = nvrtc.compile(cu, name, cc) if config.DUMP_ASSEMBLY: - print(("ASSEMBLY %s" % name).center(80, '-')) + print(("ASSEMBLY %s" % name).center(80, "-")) print(ptx) - print('=' * 80) + print("=" * 80) # Link the program's PTX using the normal linker mechanism ptx_name = os.path.splitext(name)[0] + ".ptx" @@ -2654,7 +2738,7 @@ def add_file(self, path, kind): """Add code from a file to the link""" def add_cu_file(self, path): - with open(path, 'rb') as f: + with open(path, "rb") as f: cu = f.read() self.add_cu(cu, os.path.basename(path)) @@ -2672,24 +2756,24 @@ def add_file_guess_ext(self, path_or_code, ignore_nonlto=False): if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix - if ext == '': + if ext == "": raise RuntimeError( "Don't know how to link file with no extension" ) - elif ext == '.cu': + elif ext == ".cu": self.add_cu_file(path_or_code) else: - kind = FILE_EXTENSION_MAP.get(ext.lstrip('.'), None) + kind = FILE_EXTENSION_MAP.get(ext.lstrip("."), None) if kind is None: raise RuntimeError( - "Don't know how to link file with extension " - f"{ext}" + f"Don't know how to link file with extension {ext}" ) if ignore_nonlto: warn_and_return = False if kind in ( - FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"] + FILE_EXTENSION_MAP["fatbin"], + FILE_EXTENSION_MAP["o"], ): entry_types = inspect_obj_content(path_or_code) if "nvvm" not in entry_types: @@ -2754,6 +2838,7 @@ class MVCLinker(Linker): Linker supporting Minor Version Compatibility, backed by the cubinlinker package. 
""" + def __init__(self, max_registers=None, lineinfo=False, cc=None): try: from cubinlinker import CubinLinker @@ -2761,18 +2846,20 @@ def __init__(self, max_registers=None, lineinfo=False, cc=None): raise ImportError(_MVC_ERROR_MESSAGE) from err if cc is None: - raise RuntimeError("MVCLinker requires Compute Capability to be " - "specified, but cc is None") + raise RuntimeError( + "MVCLinker requires Compute Capability to be " + "specified, but cc is None" + ) super().__init__(max_registers, lineinfo, cc) arch = f"sm_{cc[0] * 10 + cc[1]}" - ptx_compile_opts = ['--gpu-name', arch, '-c'] + ptx_compile_opts = ["--gpu-name", arch, "-c"] if max_registers: arg = f"--maxrregcount={max_registers}" ptx_compile_opts.append(arg) if lineinfo: - ptx_compile_opts.append('--generate-line-info') + ptx_compile_opts.append("--generate-line-info") self.ptx_compile_options = tuple(ptx_compile_opts) self._linker = CubinLinker(f"--arch={arch}") @@ -2785,7 +2872,7 @@ def info_log(self): def error_log(self): return self._linker.error_log - def add_ptx(self, ptx, name=''): + def add_ptx(self, ptx, name=""): try: from ptxcompiler import compile_ptx from cubinlinker import CubinLinkerError @@ -2804,19 +2891,19 @@ def add_file(self, path, kind): raise ImportError(_MVC_ERROR_MESSAGE) from err try: - with open(path, 'rb') as f: + with open(path, "rb") as f: data = f.read() except FileNotFoundError: - raise LinkerError(f'{path} not found') + raise LinkerError(f"{path} not found") name = pathlib.Path(path).name - if kind == FILE_EXTENSION_MAP['cubin']: + if kind == FILE_EXTENSION_MAP["cubin"]: fn = self._linker.add_cubin - elif kind == FILE_EXTENSION_MAP['fatbin']: + elif kind == FILE_EXTENSION_MAP["fatbin"]: fn = self._linker.add_fatbin - elif kind == FILE_EXTENSION_MAP['a']: + elif kind == FILE_EXTENSION_MAP["a"]: raise LinkerError(f"Don't know how to link {kind}") - elif kind == FILE_EXTENSION_MAP['ptx']: + elif kind == FILE_EXTENSION_MAP["ptx"]: return self.add_ptx(data, name) else: raise 
LinkerError(f"Don't know how to link {kind}") @@ -2842,6 +2929,7 @@ class CtypesLinker(Linker): """ Links for current device if no CC given """ + def __init__(self, max_registers=0, lineinfo=False, cc=None): super().__init__(max_registers, lineinfo, cc) @@ -2875,8 +2963,9 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): option_vals = (c_void_p * len(raw_values))(*raw_values) self.handle = handle = drvapi.cu_link_state() - driver.cuLinkCreate(len(raw_keys), option_keys, option_vals, - byref(self.handle)) + driver.cuLinkCreate( + len(raw_keys), option_keys, option_vals, byref(self.handle) + ) weakref.finalize(self, driver.cuLinkDestroy, handle) @@ -2887,19 +2976,27 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): @property def info_log(self): - return self.linker_info_buf.value.decode('utf8') + return self.linker_info_buf.value.decode("utf8") @property def error_log(self): - return self.linker_errors_buf.value.decode('utf8') + return self.linker_errors_buf.value.decode("utf8") - def add_ptx(self, ptx, name=''): + def add_ptx(self, ptx, name=""): ptxbuf = c_char_p(ptx) - namebuf = c_char_p(name.encode('utf8')) + namebuf = c_char_p(name.encode("utf8")) self._keep_alive += [ptxbuf, namebuf] try: - driver.cuLinkAddData(self.handle, enums.CU_JIT_INPUT_PTX, - ptxbuf, len(ptx), namebuf, 0, None, None) + driver.cuLinkAddData( + self.handle, + enums.CU_JIT_INPUT_PTX, + ptxbuf, + len(ptx), + namebuf, + 0, + None, + None, + ) except CudaAPIError as e: raise LinkerError("%s\n%s" % (e, self.error_log)) @@ -2911,7 +3008,7 @@ def add_file(self, path, kind): driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None) except CudaAPIError as e: if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND: - msg = f'{path} not found' + msg = f"{path} not found" else: msg = "%s\n%s" % (e, self.error_log) raise LinkerError(msg) @@ -2926,7 +3023,7 @@ def complete(self): raise LinkerError("%s\n%s" % (e, self.error_log)) size = size.value - assert size > 0, 'linker 
returned a zero sized cubin' + assert size > 0, "linker returned a zero sized cubin" del self._keep_alive[:] # We return a copy of the cubin because it's owned by the linker @@ -2938,6 +3035,7 @@ class CudaPythonLinker(Linker): """ Links for current device if no CC given """ + def __init__(self, max_registers=0, lineinfo=False, cc=None): super().__init__(max_registers, lineinfo, cc) @@ -2964,8 +3062,9 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): options[jit_option.CU_JIT_TARGET_FROM_CUCONTEXT] = 1 else: cc_val = cc[0] * 10 + cc[1] - cc_enum = getattr(binding.CUjit_target, - f'CU_TARGET_COMPUTE_{cc_val}') + cc_enum = getattr( + binding.CUjit_target, f"CU_TARGET_COMPUTE_{cc_val}" + ) options[jit_option.CU_JIT_TARGET] = cc_enum raw_keys = list(options.keys()) @@ -2982,19 +3081,20 @@ def __init__(self, max_registers=0, lineinfo=False, cc=None): @property def info_log(self): - return self.linker_info_buf.decode('utf8') + return self.linker_info_buf.decode("utf8") @property def error_log(self): - return self.linker_errors_buf.decode('utf8') + return self.linker_errors_buf.decode("utf8") - def add_ptx(self, ptx, name=''): - namebuf = name.encode('utf8') + def add_ptx(self, ptx, name=""): + namebuf = name.encode("utf8") self._keep_alive += [ptx, namebuf] try: input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX - driver.cuLinkAddData(self.handle, input_ptx, ptx, len(ptx), - namebuf, 0, [], []) + driver.cuLinkAddData( + self.handle, input_ptx, ptx, len(ptx), namebuf, 0, [], [] + ) except CudaAPIError as e: raise LinkerError("%s\n%s" % (e, self.error_log)) @@ -3006,7 +3106,7 @@ def add_file(self, path, kind): driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, [], []) except CudaAPIError as e: if e.code == binding.CUresult.CUDA_ERROR_FILE_NOT_FOUND: - msg = f'{path} not found' + msg = f"{path} not found" else: msg = "%s\n%s" % (e, self.error_log) raise LinkerError(msg) @@ -3017,7 +3117,7 @@ def complete(self): except CudaAPIError as e: raise 
LinkerError("%s\n%s" % (e, self.error_log)) - assert size > 0, 'linker returned a zero sized cubin' + assert size > 0, "linker returned a zero sized cubin" del self._keep_alive[:] # We return a copy of the cubin because it's owned by the linker cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char)) @@ -3151,6 +3251,7 @@ def complete(self): except NvJitLinkError as e: raise LinkerError from e + # ----------------------------------------------------------------------------- @@ -3200,7 +3301,7 @@ def device_memory_size(devmem): The result is cached in the device memory object. It may query the driver for the memory size of the device memory allocation. """ - sz = getattr(devmem, '_cuda_memsize_', None) + sz = getattr(devmem, "_cuda_memsize_", None) if sz is None: s, e = device_extents(devmem) if USE_NV_BINDING: @@ -3213,10 +3314,9 @@ def device_memory_size(devmem): def _is_datetime_dtype(obj): - """Returns True if the obj.dtype is datetime64 or timedelta64 - """ - dtype = getattr(obj, 'dtype', None) - return dtype is not None and dtype.char in 'Mm' + """Returns True if the obj.dtype is datetime64 or timedelta64""" + dtype = getattr(obj, "dtype", None) + return dtype is not None and dtype.char in "Mm" def _workaround_for_datetime(obj): @@ -3295,12 +3395,11 @@ def is_device_memory(obj): "device_pointer" which value is an int object carrying the pointer value of the device memory address. This is not tested in this method. """ - return getattr(obj, '__cuda_memory__', False) + return getattr(obj, "__cuda_memory__", False) def require_device_memory(obj): - """A sentry for methods that accept CUDA memory object. - """ + """A sentry for methods that accept CUDA memory object.""" if not is_device_memory(obj): raise Exception("Not a CUDA memory object.") @@ -3391,16 +3490,16 @@ def device_memset(dst, val, size, stream=0): def profile_start(): - ''' + """ Enable profile collection in the current context. 
- ''' + """ driver.cuProfilerStart() def profile_stop(): - ''' + """ Disable profile collection in the current context. - ''' + """ driver.cuProfilerStop() @@ -3427,18 +3526,21 @@ def inspect_obj_content(objpath: str): Given path to a fatbin or object, use `cuobjdump` to examine its content Return the set of entries in the object. """ - code_types :set[str] = set() + code_types: set[str] = set() try: - out = subprocess.run(["cuobjdump", objpath], check=True, - capture_output=True) + out = subprocess.run( + ["cuobjdump", objpath], check=True, capture_output=True + ) except FileNotFoundError as e: - msg = ("cuobjdump has not been found. You may need " - "to install the CUDA toolkit and ensure that " - "it is available on your PATH.\n") + msg = ( + "cuobjdump has not been found. You may need " + "to install the CUDA toolkit and ensure that " + "it is available on your PATH.\n" + ) raise RuntimeError(msg) from e - objtable = out.stdout.decode('utf-8') + objtable = out.stdout.decode("utf-8") entry_pattern = r"Fatbin (.*) code" for line in objtable.split("\n"): if match := re.match(entry_pattern, line): diff --git a/numba_cuda/numba/cuda/cudadrv/drvapi.py b/numba_cuda/numba/cuda/cudadrv/drvapi.py index 7f6dfbbdc..1aeeecc44 100644 --- a/numba_cuda/numba/cuda/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/cudadrv/drvapi.py @@ -1,20 +1,31 @@ -from ctypes import (c_byte, c_char_p, c_float, c_int, c_size_t, c_uint, - c_uint8, c_void_p, py_object, CFUNCTYPE, POINTER, - Structure) +from ctypes import ( + c_byte, + c_char_p, + c_float, + c_int, + c_size_t, + c_uint, + c_uint8, + c_void_p, + py_object, + CFUNCTYPE, + POINTER, + Structure, +) cu_device = c_int -cu_device_attribute = c_int # enum -cu_context = c_void_p # an opaque handle -cu_module = c_void_p # an opaque handle -cu_jit_option = c_int # enum -cu_jit_input_type = c_int # enum -cu_function = c_void_p # an opaque handle -cu_device_ptr = c_size_t # defined as unsigned long long -cu_stream = c_void_p # an opaque handle 
+cu_device_attribute = c_int # enum +cu_context = c_void_p # an opaque handle +cu_module = c_void_p # an opaque handle +cu_jit_option = c_int # enum +cu_jit_input_type = c_int # enum +cu_function = c_void_p # an opaque handle +cu_device_ptr = c_size_t # defined as unsigned long long +cu_stream = c_void_p # an opaque handle cu_event = c_void_p cu_link_state = c_void_p cu_function_attribute = c_int -cu_uuid = (c_byte * 16) # Device UUID +cu_uuid = c_byte * 16 # Device UUID cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) @@ -33,154 +44,145 @@ class cu_ipc_mem_handle(Structure): API_PROTOTYPES = { # CUresult cuInit(unsigned int Flags); - 'cuInit' : (c_int, c_uint), - + "cuInit": (c_int, c_uint), # CUresult cuDriverGetVersion (int* driverVersion ) - 'cuDriverGetVersion': (c_int, POINTER(c_int)), - + "cuDriverGetVersion": (c_int, POINTER(c_int)), # CUresult cuDeviceGetCount(int *count); - 'cuDeviceGetCount': (c_int, POINTER(c_int)), - + "cuDeviceGetCount": (c_int, POINTER(c_int)), # CUresult cuDeviceGet(CUdevice *device, int ordinal); - 'cuDeviceGet': (c_int, POINTER(cu_device), c_int), - + "cuDeviceGet": (c_int, POINTER(cu_device), c_int), # CUresult cuDeviceGetName ( char* name, int len, CUdevice dev ) - 'cuDeviceGetName': (c_int, c_char_p, c_int, cu_device), - + "cuDeviceGetName": (c_int, c_char_p, c_int, cu_device), # CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, # CUdevice dev); - 'cuDeviceGetAttribute': (c_int, POINTER(c_int), cu_device_attribute, - cu_device), - + "cuDeviceGetAttribute": ( + c_int, + POINTER(c_int), + cu_device_attribute, + cu_device, + ), # CUresult cuDeviceComputeCapability(int *major, int *minor, # CUdevice dev); - 'cuDeviceComputeCapability': (c_int, POINTER(c_int), POINTER(c_int), - cu_device), - + "cuDeviceComputeCapability": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_device, + ), # CUresult cuDevicePrimaryCtxGetState( # CUdevice dev, # unsigned int* flags, # int* active) - 
'cuDevicePrimaryCtxGetState': (c_int, - cu_device, POINTER(c_uint), POINTER(c_int)), - + "cuDevicePrimaryCtxGetState": ( + c_int, + cu_device, + POINTER(c_uint), + POINTER(c_int), + ), # CUresult cuDevicePrimaryCtxRelease ( CUdevice dev ) - 'cuDevicePrimaryCtxRelease': (c_int, cu_device), - + "cuDevicePrimaryCtxRelease": (c_int, cu_device), # CUresult cuDevicePrimaryCtxReset ( CUdevice dev ) - 'cuDevicePrimaryCtxReset': (c_int, cu_device), - + "cuDevicePrimaryCtxReset": (c_int, cu_device), # CUresult cuDevicePrimaryCtxRetain ( CUcontext* pctx, CUdevice dev ) - 'cuDevicePrimaryCtxRetain': (c_int, POINTER(cu_context), cu_device), - + "cuDevicePrimaryCtxRetain": (c_int, POINTER(cu_context), cu_device), # CUresult cuDevicePrimaryCtxSetFlags ( CUdevice dev, unsigned int flags ) - 'cuDevicePrimaryCtxSetFlags': (c_int, cu_device, c_uint), - + "cuDevicePrimaryCtxSetFlags": (c_int, cu_device, c_uint), # CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, # CUdevice dev); - 'cuCtxCreate': (c_int, POINTER(cu_context), c_uint, cu_device), - + "cuCtxCreate": (c_int, POINTER(cu_context), c_uint, cu_device), # CUresult cuCtxGetDevice ( CUdevice * device ) - 'cuCtxGetDevice': (c_int, POINTER(cu_device)), - + "cuCtxGetDevice": (c_int, POINTER(cu_device)), # CUresult cuCtxGetCurrent (CUcontext *pctx); - 'cuCtxGetCurrent': (c_int, POINTER(cu_context)), - + "cuCtxGetCurrent": (c_int, POINTER(cu_context)), # CUresult cuCtxPushCurrent (CUcontext pctx); - 'cuCtxPushCurrent': (c_int, cu_context), - + "cuCtxPushCurrent": (c_int, cu_context), # CUresult cuCtxPopCurrent (CUcontext *pctx); - 'cuCtxPopCurrent': (c_int, POINTER(cu_context)), - + "cuCtxPopCurrent": (c_int, POINTER(cu_context)), # CUresult cuCtxDestroy(CUcontext pctx); - 'cuCtxDestroy': (c_int, cu_context), - + "cuCtxDestroy": (c_int, cu_context), # CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, # unsigned int numOptions, # CUjit_option *options, # void **optionValues); - 'cuModuleLoadDataEx': (c_int, 
cu_module, c_void_p, c_uint, - POINTER(cu_jit_option), POINTER(c_void_p)), - + "cuModuleLoadDataEx": ( + c_int, + cu_module, + c_void_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult cuModuleUnload(CUmodule hmod); - 'cuModuleUnload': (c_int, cu_module), - + "cuModuleUnload": (c_int, cu_module), # CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, # const char *name); - 'cuModuleGetFunction': (c_int, cu_function, cu_module, c_char_p), - + "cuModuleGetFunction": (c_int, cu_function, cu_module, c_char_p), # CUresult cuModuleGetGlobal ( CUdeviceptr* dptr, size_t* bytes, CUmodule # hmod, const char* name ) - 'cuModuleGetGlobal': (c_int, POINTER(cu_device_ptr), POINTER(c_size_t), - cu_module, c_char_p), - + "cuModuleGetGlobal": ( + c_int, + POINTER(cu_device_ptr), + POINTER(c_size_t), + cu_module, + c_char_p, + ), # CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, # CUfunc_cache config); - 'cuFuncSetCacheConfig': (c_int, cu_function, c_uint), - + "cuFuncSetCacheConfig": (c_int, cu_function, c_uint), # CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); - 'cuMemAlloc': (c_int, POINTER(cu_device_ptr), c_size_t), - + "cuMemAlloc": (c_int, POINTER(cu_device_ptr), c_size_t), # CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, # unsigned int flags); - 'cuMemAllocManaged': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemAllocManaged": (c_int, c_void_p, c_size_t, c_uint), # CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) - 'cuMemsetD8': (c_int, cu_device_ptr, c_uint8, c_size_t), - + "cuMemsetD8": (c_int, cu_device_ptr, c_uint8, c_size_t), # CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, # size_t N, CUstream hStream); - 'cuMemsetD8Async': (c_int, - cu_device_ptr, c_uint8, c_size_t, cu_stream), - + "cuMemsetD8Async": (c_int, cu_device_ptr, c_uint8, c_size_t, cu_stream), # CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, # size_t ByteCount); - 'cuMemcpyHtoD': 
(c_int, cu_device_ptr, c_void_p, c_size_t), - + "cuMemcpyHtoD": (c_int, cu_device_ptr, c_void_p, c_size_t), # CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, # size_t ByteCount, CUstream hStream); - 'cuMemcpyHtoDAsync': (c_int, cu_device_ptr, c_void_p, c_size_t, - cu_stream), - + "cuMemcpyHtoDAsync": (c_int, cu_device_ptr, c_void_p, c_size_t, cu_stream), # CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, const void *srcDevice, # size_t ByteCount); - 'cuMemcpyDtoD': (c_int, cu_device_ptr, cu_device_ptr, c_size_t), - + "cuMemcpyDtoD": (c_int, cu_device_ptr, cu_device_ptr, c_size_t), # CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, const void *srcDevice, # size_t ByteCount, CUstream hStream); - 'cuMemcpyDtoDAsync': (c_int, cu_device_ptr, cu_device_ptr, c_size_t, - cu_stream), - - + "cuMemcpyDtoDAsync": ( + c_int, + cu_device_ptr, + cu_device_ptr, + c_size_t, + cu_stream, + ), # CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, # size_t ByteCount); - 'cuMemcpyDtoH': (c_int, c_void_p, cu_device_ptr, c_size_t), - + "cuMemcpyDtoH": (c_int, c_void_p, cu_device_ptr, c_size_t), # CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, # size_t ByteCount, CUstream hStream); - 'cuMemcpyDtoHAsync': (c_int, c_void_p, cu_device_ptr, c_size_t, - cu_stream), - + "cuMemcpyDtoHAsync": (c_int, c_void_p, cu_device_ptr, c_size_t, cu_stream), # CUresult cuMemFree(CUdeviceptr dptr); - 'cuMemFree': (c_int, cu_device_ptr), - + "cuMemFree": (c_int, cu_device_ptr), # CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); - 'cuStreamCreate': (c_int, POINTER(cu_stream), c_uint), - + "cuStreamCreate": (c_int, POINTER(cu_stream), c_uint), # CUresult cuStreamDestroy(CUstream hStream); - 'cuStreamDestroy': (c_int, cu_stream), - + "cuStreamDestroy": (c_int, cu_stream), # CUresult cuStreamSynchronize(CUstream hStream); - 'cuStreamSynchronize': (c_int, cu_stream), - + "cuStreamSynchronize": (c_int, cu_stream), # CUresult cuStreamAddCallback( # 
CUstream hStream, # CUstreamCallback callback, # void* userData, # unsigned int flags) - 'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj, - py_object, c_uint), - + "cuStreamAddCallback": ( + c_int, + cu_stream, + cu_stream_callback_pyobj, + py_object, + c_uint, + ), # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, @@ -190,10 +192,20 @@ class cu_ipc_mem_handle(Structure): # unsigned int sharedMemBytes, # CUstream hStream, void **kernelParams, # void ** extra) - 'cuLaunchKernel': (c_int, cu_function, c_uint, c_uint, c_uint, - c_uint, c_uint, c_uint, c_uint, cu_stream, - POINTER(c_void_p), POINTER(c_void_p)), - + "cuLaunchKernel": ( + c_int, + cu_function, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + cu_stream, + POINTER(c_void_p), + POINTER(c_void_p), + ), # CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, @@ -202,197 +214,219 @@ class cu_ipc_mem_handle(Structure): # unsigned int blockDimZ, # unsigned int sharedMemBytes, # CUstream hStream, void **kernelParams) - 'cuLaunchCooperativeKernel': (c_int, cu_function, c_uint, c_uint, c_uint, - c_uint, c_uint, c_uint, c_uint, cu_stream, - POINTER(c_void_p)), - + "cuLaunchCooperativeKernel": ( + c_int, + cu_function, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + cu_stream, + POINTER(c_void_p), + ), # CUresult cuMemHostAlloc ( void ** pp, # size_t bytesize, # unsigned int Flags # ) - 'cuMemHostAlloc': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemHostAlloc": (c_int, c_void_p, c_size_t, c_uint), # CUresult cuMemFreeHost ( void * p ) - 'cuMemFreeHost': (c_int, c_void_p), - + "cuMemFreeHost": (c_int, c_void_p), # CUresult cuMemHostRegister(void * p, # size_t bytesize, # unsigned int Flags) - 'cuMemHostRegister': (c_int, c_void_p, c_size_t, c_uint), - + "cuMemHostRegister": (c_int, c_void_p, c_size_t, c_uint), # CUresult 
cuMemHostUnregister(void * p) - 'cuMemHostUnregister': (c_int, c_void_p), - + "cuMemHostUnregister": (c_int, c_void_p), # CUresult cuMemHostGetDevicePointer(CUdeviceptr * pdptr, # void * p, # unsigned int Flags) - 'cuMemHostGetDevicePointer': (c_int, POINTER(cu_device_ptr), - c_void_p, c_uint), - + "cuMemHostGetDevicePointer": ( + c_int, + POINTER(cu_device_ptr), + c_void_p, + c_uint, + ), # CUresult cuMemGetInfo(size_t * free, size_t * total) - 'cuMemGetInfo' : (c_int, POINTER(c_size_t), POINTER(c_size_t)), - + "cuMemGetInfo": (c_int, POINTER(c_size_t), POINTER(c_size_t)), # CUresult cuEventCreate ( CUevent * phEvent, # unsigned int Flags ) - 'cuEventCreate': (c_int, POINTER(cu_event), c_uint), - + "cuEventCreate": (c_int, POINTER(cu_event), c_uint), # CUresult cuEventDestroy ( CUevent hEvent ) - 'cuEventDestroy': (c_int, cu_event), - + "cuEventDestroy": (c_int, cu_event), # CUresult cuEventElapsedTime ( float * pMilliseconds, # CUevent hStart, # CUevent hEnd ) - 'cuEventElapsedTime': (c_int, POINTER(c_float), cu_event, cu_event), - + "cuEventElapsedTime": (c_int, POINTER(c_float), cu_event, cu_event), # CUresult cuEventQuery ( CUevent hEvent ) - 'cuEventQuery': (c_int, cu_event), - + "cuEventQuery": (c_int, cu_event), # CUresult cuEventRecord ( CUevent hEvent, # CUstream hStream ) - 'cuEventRecord': (c_int, cu_event, cu_stream), - + "cuEventRecord": (c_int, cu_event, cu_stream), # CUresult cuEventSynchronize ( CUevent hEvent ) - 'cuEventSynchronize': (c_int, cu_event), - - + "cuEventSynchronize": (c_int, cu_event), # CUresult cuStreamWaitEvent ( CUstream hStream, # CUevent hEvent, # unsigned int Flags ) - 'cuStreamWaitEvent': (c_int, cu_stream, cu_event, c_uint), - + "cuStreamWaitEvent": (c_int, cu_stream, cu_event, c_uint), # CUresult cuPointerGetAttribute ( # void *data, # CUpointer_attribute attribute, # CUdeviceptr ptr) - 'cuPointerGetAttribute': (c_int, c_void_p, c_uint, cu_device_ptr), - + "cuPointerGetAttribute": (c_int, c_void_p, c_uint, cu_device_ptr), # 
CUresult cuMemGetAddressRange ( CUdeviceptr * pbase, # size_t * psize, # CUdeviceptr dptr # ) - 'cuMemGetAddressRange': (c_int, - POINTER(cu_device_ptr), - POINTER(c_size_t), - cu_device_ptr), - + "cuMemGetAddressRange": ( + c_int, + POINTER(cu_device_ptr), + POINTER(c_size_t), + cu_device_ptr, + ), # CUresult cuMemHostGetFlags ( unsigned int * pFlags, # void * p ) - 'cuMemHostGetFlags': (c_int, - POINTER(c_uint), - c_void_p), - + "cuMemHostGetFlags": (c_int, POINTER(c_uint), c_void_p), # CUresult cuCtxSynchronize ( void ) - 'cuCtxSynchronize' : (c_int,), - + "cuCtxSynchronize": (c_int,), # CUresult # cuLinkCreate(unsigned int numOptions, CUjit_option *options, # void **optionValues, CUlinkState *stateOut); - 'cuLinkCreate': (c_int, - c_uint, POINTER(cu_jit_option), - POINTER(c_void_p), POINTER(cu_link_state)), - + "cuLinkCreate": ( + c_int, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + POINTER(cu_link_state), + ), # CUresult # cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, # size_t size, const char *name, unsigned # int numOptions, CUjit_option *options, # void **optionValues); - 'cuLinkAddData': (c_int, - cu_link_state, cu_jit_input_type, c_void_p, - c_size_t, c_char_p, c_uint, POINTER(cu_jit_option), - POINTER(c_void_p)), - + "cuLinkAddData": ( + c_int, + cu_link_state, + cu_jit_input_type, + c_void_p, + c_size_t, + c_char_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult # cuLinkAddFile(CUlinkState state, CUjitInputType type, # const char *path, unsigned int numOptions, # CUjit_option *options, void **optionValues); - - 'cuLinkAddFile': (c_int, - cu_link_state, cu_jit_input_type, c_char_p, c_uint, - POINTER(cu_jit_option), POINTER(c_void_p)), - + "cuLinkAddFile": ( + c_int, + cu_link_state, + cu_jit_input_type, + c_char_p, + c_uint, + POINTER(cu_jit_option), + POINTER(c_void_p), + ), # CUresult CUDAAPI # cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) - 'cuLinkComplete': (c_int, - 
cu_link_state, POINTER(c_void_p), POINTER(c_size_t)), - + "cuLinkComplete": ( + c_int, + cu_link_state, + POINTER(c_void_p), + POINTER(c_size_t), + ), # CUresult CUDAAPI # cuLinkDestroy(CUlinkState state) - 'cuLinkDestroy': (c_int, cu_link_state), - + "cuLinkDestroy": (c_int, cu_link_state), # cuProfilerStart ( void ) - 'cuProfilerStart': (c_int,), - + "cuProfilerStart": (c_int,), # cuProfilerStop ( void ) - 'cuProfilerStop': (c_int,), - + "cuProfilerStop": (c_int,), # CUresult cuFuncGetAttribute ( int* pi, CUfunction_attribute attrib, # CUfunction hfunc ) - 'cuFuncGetAttribute': (c_int, - POINTER(c_int), cu_function_attribute, cu_function), - + "cuFuncGetAttribute": ( + c_int, + POINTER(c_int), + cu_function_attribute, + cu_function, + ), # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor( # int *numBlocks, # CUfunction func, # int blockSize, # size_t dynamicSMemSize); - 'cuOccupancyMaxActiveBlocksPerMultiprocessor': (c_int, POINTER(c_int), - cu_function, c_size_t, - c_uint), - + "cuOccupancyMaxActiveBlocksPerMultiprocessor": ( + c_int, + POINTER(c_int), + cu_function, + c_size_t, + c_uint, + ), # CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( # int *numBlocks, # CUfunction func, # int blockSize, # size_t dynamicSMemSize, # unsigned int flags); - 'cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags': (c_int, - POINTER(c_int), - cu_function, - c_size_t, c_uint), - + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags": ( + c_int, + POINTER(c_int), + cu_function, + c_size_t, + c_uint, + ), # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize( # int *minGridSize, int *blockSize, # CUfunction func, # CUoccupancyB2DSize blockSizeToDynamicSMemSize, # size_t dynamicSMemSize, int blockSizeLimit); - 'cuOccupancyMaxPotentialBlockSize': (c_int, POINTER(c_int), POINTER(c_int), - cu_function, cu_occupancy_b2d_size, - c_size_t, c_int), - + "cuOccupancyMaxPotentialBlockSize": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_function, + 
cu_occupancy_b2d_size, + c_size_t, + c_int, + ), # CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags( # int *minGridSize, int *blockSize, # CUfunction func, # CUoccupancyB2DSize blockSizeToDynamicSMemSize, # size_t dynamicSMemSize, int blockSizeLimit, # unsigned int flags); - 'cuOccupancyMaxPotentialBlockSizeWithFlags': (c_int, POINTER(c_int), - POINTER(c_int), cu_function, - cu_occupancy_b2d_size, - c_size_t, c_int, c_uint), - + "cuOccupancyMaxPotentialBlockSizeWithFlags": ( + c_int, + POINTER(c_int), + POINTER(c_int), + cu_function, + cu_occupancy_b2d_size, + c_size_t, + c_int, + c_uint, + ), # CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr ) - 'cuIpcGetMemHandle': (c_int, - POINTER(cu_ipc_mem_handle), cu_device_ptr), - + "cuIpcGetMemHandle": (c_int, POINTER(cu_ipc_mem_handle), cu_device_ptr), # CUresult cuIpcOpenMemHandle( # CUdeviceptr* pdptr, # CUipcMemHandle handle, # unsigned int Flags) - 'cuIpcOpenMemHandle': (c_int, POINTER(cu_device_ptr), cu_ipc_mem_handle, - c_uint), - + "cuIpcOpenMemHandle": ( + c_int, + POINTER(cu_device_ptr), + cu_ipc_mem_handle, + c_uint, + ), # CUresult cuIpcCloseMemHandle ( CUdeviceptr dptr ) - - 'cuIpcCloseMemHandle': (c_int, cu_device_ptr), - + "cuIpcCloseMemHandle": (c_int, cu_device_ptr), # CUresult cuCtxEnablePeerAccess (CUcontext peerContext, unsigned int Flags) - 'cuCtxEnablePeerAccess': (c_int, cu_context, c_int), - + "cuCtxEnablePeerAccess": (c_int, cu_context, c_int), # CUresult cuDeviceCanAccessPeer ( int* canAccessPeer, # CUdevice dev, CUdevice peerDev ) - 'cuDeviceCanAccessPeer': (c_int, - POINTER(c_int), cu_device, cu_device), - + "cuDeviceCanAccessPeer": (c_int, POINTER(c_int), cu_device, cu_device), # CUresult cuDeviceGetUuid ( CUuuid* uuid, CUdevice dev ) - 'cuDeviceGetUuid': (c_int, POINTER(cu_uuid), cu_device), + "cuDeviceGetUuid": (c_int, POINTER(cu_uuid), cu_device), } diff --git a/numba_cuda/numba/cuda/cudadrv/dummyarray.py b/numba_cuda/numba/cuda/cudadrv/dummyarray.py index 
38e1b890e..a3e21b633 100644 --- a/numba_cuda/numba/cuda/cudadrv/dummyarray.py +++ b/numba_cuda/numba/cuda/cudadrv/dummyarray.py @@ -20,7 +20,7 @@ np.ctypeslib.ndpointer(np.ctypeslib.c_intp, ndim=1), # newstrides ctypes.c_long, # itemsize ctypes.c_int, # is_f_order -)(_helperlib.c_helpers['attempt_nocopy_reshape']) +)(_helperlib.c_helpers["attempt_nocopy_reshape"]) class Dim(object): @@ -37,7 +37,8 @@ class Dim(object): stride: item stride """ - __slots__ = 'start', 'stop', 'size', 'stride', 'single' + + __slots__ = "start", "stop", "size", "stride", "single" def __init__(self, start, stop, size, stride, single): self.start = start @@ -58,15 +59,11 @@ def __getitem__(self, item): else: size = _compute_size(start, stop, stride) ret = Dim( - start=start, - stop=stop, - size=size, - stride=stride, - single=False + start=start, stop=stop, size=size, stride=stride, single=False ) return ret else: - sliced = self[item:item + 1] if item != -1 else self[-1:] + sliced = self[item : item + 1] if item != -1 else self[-1:] if sliced.size != 1: raise IndexError return Dim( @@ -85,8 +82,13 @@ def __repr__(self): return strfmt % (self.start, self.stop, self.size, self.stride) def normalize(self, base): - return Dim(start=self.start - base, stop=self.stop - base, - size=self.size, stride=self.stride, single=self.single) + return Dim( + start=self.start - base, + stop=self.stop - base, + size=self.size, + stride=self.stride, + single=self.single, + ) def copy(self, start=None, stop=None, size=None, stride=None, single=None): if start is None: @@ -143,14 +145,16 @@ class Array(object): extent: (start, end) start and end offset containing the memory region """ + is_array = True @classmethod def from_desc(cls, offset, shape, strides, itemsize): dims = [] for ashape, astride in zip(shape, strides): - dim = Dim(offset, offset + ashape * astride, ashape, astride, - single=False) + dim = Dim( + offset, offset + ashape * astride, ashape, astride, single=False + ) dims.append(dim) offset = 0 
# offset only applies to first dimension return cls(dims, itemsize) @@ -173,23 +177,23 @@ def _compute_layout(self): # Records have no dims, and we can treat them as contiguous if not self.dims: - return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} # If this is a broadcast array then it is not contiguous if any([dim.stride == 0 for dim in self.dims]): - return {'C_CONTIGUOUS': False, 'F_CONTIGUOUS': False} + return {"C_CONTIGUOUS": False, "F_CONTIGUOUS": False} - flags = {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + flags = {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} # Check C contiguity sd = self.itemsize for dim in reversed(self.dims): if dim.size == 0: # Contiguous by definition - return {'C_CONTIGUOUS': True, 'F_CONTIGUOUS': True} + return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True} if dim.size != 1: if dim.stride != sd: - flags['C_CONTIGUOUS'] = False + flags["C_CONTIGUOUS"] = False sd *= dim.size # Check F contiguity @@ -197,7 +201,7 @@ def _compute_layout(self): for dim in self.dims: if dim.size != 1: if dim.stride != sd: - flags['F_CONTIGUOUS'] = False + flags["F_CONTIGUOUS"] = False return flags sd *= dim.size @@ -208,11 +212,11 @@ def _compute_extent(self): lastidx = [s - 1 for s in self.shape] start = compute_index(firstidx, self.dims) stop = compute_index(lastidx, self.dims) + self.itemsize - stop = max(stop, start) # ensure positive extent + stop = max(stop, start) # ensure positive extent return Extent(start, stop) def __repr__(self): - return '' % (self.dims, self.itemsize) + return "" % (self.dims, self.itemsize) def __getitem__(self, item): if not isinstance(item, tuple): @@ -240,15 +244,14 @@ def __getitem__(self, item): @property def is_c_contig(self): - return self.flags['C_CONTIGUOUS'] + return self.flags["C_CONTIGUOUS"] @property def is_f_contig(self): - return self.flags['F_CONTIGUOUS'] + return self.flags["F_CONTIGUOUS"] def iter_contiguous_extent(self): - """ Generates extents - """ + 
"""Generates extents""" if self.is_c_contig or self.is_f_contig: yield self.extent else: @@ -279,11 +282,11 @@ def reshape(self, *newdims, **kws): if newdims == self.shape: return self, None - order = kws.pop('order', 'C') + order = kws.pop("order", "C") if kws: - raise TypeError('unknown keyword arguments %s' % kws.keys()) - if order not in 'CFA': - raise ValueError('order not C|F|A') + raise TypeError("unknown keyword arguments %s" % kws.keys()) + if order not in "CFA": + raise ValueError("order not C|F|A") # check for exactly one instance of -1 in newdims # https://github.com/numpy/numpy/blob/623bc1fae1d47df24e7f1e29321d0c0ba2771ce0/numpy/core/src/multiarray/shape.c#L470-L515 # noqa: E501 @@ -301,25 +304,28 @@ def reshape(self, *newdims, **kws): # compute the missing dimension if unknownidx >= 0: if knownsize == 0 or self.size % knownsize != 0: - raise ValueError("cannot infer valid shape " - "for unknown dimension") + raise ValueError( + "cannot infer valid shape for unknown dimension" + ) else: - newdims = newdims[0:unknownidx] \ - + (self.size // knownsize,) \ - + newdims[unknownidx + 1:] + newdims = ( + newdims[0:unknownidx] + + (self.size // knownsize,) + + newdims[unknownidx + 1 :] + ) newsize = functools.reduce(operator.mul, newdims, 1) - if order == 'A': - order = 'F' if self.is_f_contig else 'C' + if order == "A": + order = "F" if self.is_f_contig else "C" if newsize != self.size: raise ValueError("reshape changes the size of the array") if self.is_c_contig or self.is_f_contig: - if order == 'C': + if order == "C": newstrides = list(iter_strides_c_contig(self, newdims)) - elif order == 'F': + elif order == "F": newstrides = list(iter_strides_f_contig(self, newdims)) else: raise AssertionError("unreachable") @@ -340,12 +346,16 @@ def reshape(self, *newdims, **kws): newdims, newstrides, self.itemsize, - order == 'F', + order == "F", ): - raise NotImplementedError('reshape would require copy') + raise NotImplementedError("reshape would require copy") - ret 
= self.from_desc(self.extent.begin, shape=newdims, - strides=newstrides, itemsize=self.itemsize) + ret = self.from_desc( + self.extent.begin, + shape=newdims, + strides=newstrides, + itemsize=self.itemsize, + ) return ret, list(self.iter_contiguous_extent()) @@ -377,16 +387,21 @@ def squeeze(self, axis=None): ) return newarr, list(self.iter_contiguous_extent()) - def ravel(self, order='C'): - if order not in 'CFA': - raise ValueError('order not C|F|A') + def ravel(self, order="C"): + if order not in "CFA": + raise ValueError("order not C|F|A") - if (order in 'CA' and self.is_c_contig - or order in 'FA' and self.is_f_contig): + if ( + order in "CA" + and self.is_c_contig + or order in "FA" + and self.is_f_contig + ): newshape = (self.size,) newstrides = (self.itemsize,) - arr = self.from_desc(self.extent.begin, newshape, newstrides, - self.itemsize) + arr = self.from_desc( + self.extent.begin, newshape, newstrides, self.itemsize + ) return arr, list(self.iter_contiguous_extent()) else: @@ -394,8 +409,7 @@ def ravel(self, order='C'): def iter_strides_f_contig(arr, shape=None): - """yields the f-contiguous strides - """ + """yields the f-contiguous strides""" shape = arr.shape if shape is None else shape itemsize = arr.itemsize yield itemsize @@ -406,8 +420,7 @@ def iter_strides_f_contig(arr, shape=None): def iter_strides_c_contig(arr, shape=None): - """yields the c-contiguous strides - """ + """yields the c-contiguous strides""" shape = arr.shape if shape is None else shape itemsize = arr.itemsize @@ -438,8 +451,7 @@ def is_element_indexing(item, ndim): def _compute_size(start, stop, step): - """Algorithm adapted from cpython rangeobject.c - """ + """Algorithm adapted from cpython rangeobject.c""" if step > 0: lo = start hi = stop diff --git a/numba_cuda/numba/cuda/cudadrv/enums.py b/numba_cuda/numba/cuda/cudadrv/enums.py index e40bb182f..987234b6f 100644 --- a/numba_cuda/numba/cuda/cudadrv/enums.py +++ b/numba_cuda/numba/cuda/cudadrv/enums.py @@ -140,7 +140,7 @@ # 
Force synchronous blocking on cudaMemcpy/cudaMemset CU_CTX_SYNC_MEMOPS = 0x80 -CU_CTX_FLAGS_MASK = 0xff +CU_CTX_FLAGS_MASK = 0xFF # DEFINES diff --git a/numba_cuda/numba/cuda/cudadrv/error.py b/numba_cuda/numba/cuda/cudadrv/error.py index ec3420586..87528d06d 100644 --- a/numba_cuda/numba/cuda/cudadrv/error.py +++ b/numba_cuda/numba/cuda/cudadrv/error.py @@ -12,7 +12,7 @@ class CudaSupportError(ImportError): class NvvmError(Exception): def __str__(self): - return '\n'.join(map(str, self.args)) + return "\n".join(map(str, self.args)) class NvvmSupportError(ImportError): @@ -25,7 +25,7 @@ class NvvmWarning(Warning): class NvrtcError(Exception): def __str__(self): - return '\n'.join(map(str, self.args)) + return "\n".join(map(str, self.args)) class NvrtcCompilationError(NvrtcError): diff --git a/numba_cuda/numba/cuda/cudadrv/libs.py b/numba_cuda/numba/cuda/cudadrv/libs.py index 70c385041..7388db898 100644 --- a/numba_cuda/numba/cuda/cudadrv/libs.py +++ b/numba_cuda/numba/cuda/cudadrv/libs.py @@ -21,25 +21,25 @@ from numba.core import config -if sys.platform == 'win32': - _dllnamepattern = '%s.dll' - _staticnamepattern = '%s.lib' -elif sys.platform == 'darwin': - _dllnamepattern = 'lib%s.dylib' - _staticnamepattern = 'lib%s.a' +if sys.platform == "win32": + _dllnamepattern = "%s.dll" + _staticnamepattern = "%s.lib" +elif sys.platform == "darwin": + _dllnamepattern = "lib%s.dylib" + _staticnamepattern = "lib%s.a" else: - _dllnamepattern = 'lib%s.so' - _staticnamepattern = 'lib%s.a' + _dllnamepattern = "lib%s.so" + _staticnamepattern = "lib%s.a" def get_libdevice(): d = get_cuda_paths() - paths = d['libdevice'].info + paths = d["libdevice"].info return paths def open_libdevice(): - with open(get_libdevice(), 'rb') as bcfile: + with open(get_libdevice(), "rb") as bcfile: return bcfile.read() @@ -50,10 +50,10 @@ def get_cudalib(lib, static=False): 'libnvvm.so' for 'nvvm') so that we may attempt to load it using the system loader's search mechanism. 
""" - if lib == 'nvvm': - return get_cuda_paths()['nvvm'].info or _dllnamepattern % 'nvvm' + if lib == "nvvm": + return get_cuda_paths()["nvvm"].info or _dllnamepattern % "nvvm" else: - dir_type = 'static_cudalib_dir' if static else 'cudalib_dir' + dir_type = "static_cudalib_dir" if static else "cudalib_dir" libdir = get_cuda_paths()[dir_type].info candidates = find_lib(lib, libdir, static=static) @@ -68,7 +68,7 @@ def get_cuda_include_dir(): configuration. """ - return get_cuda_paths()['include_dir'].info + return get_cuda_paths()["include_dir"].info def check_cuda_include_dir(path): @@ -86,39 +86,38 @@ def open_cudalib(lib): def check_static_lib(path): if not os.path.isfile(path): - raise FileNotFoundError(f'{path} not found') + raise FileNotFoundError(f"{path} not found") def _get_source_variable(lib, static=False): - if lib == 'nvvm': - return get_cuda_paths()['nvvm'].by - elif lib == 'libdevice': - return get_cuda_paths()['libdevice'].by - elif lib == 'include_dir': - return get_cuda_paths()['include_dir'].by + if lib == "nvvm": + return get_cuda_paths()["nvvm"].by + elif lib == "libdevice": + return get_cuda_paths()["libdevice"].by + elif lib == "include_dir": + return get_cuda_paths()["include_dir"].by else: - dir_type = 'static_cudalib_dir' if static else 'cudalib_dir' + dir_type = "static_cudalib_dir" if static else "cudalib_dir" return get_cuda_paths()[dir_type].by def test(): - """Test library lookup. Path info is printed to stdout. - """ + """Test library lookup. 
Path info is printed to stdout.""" failed = False # Check for the driver try: dlloader, candidates = locate_driver_and_loader() - print('Finding driver from candidates:') + print("Finding driver from candidates:") for location in candidates: - print(f'\t{location}') - print(f'Using loader {dlloader}') - print('\tTrying to load driver', end='...') + print(f"\t{location}") + print(f"Using loader {dlloader}") + print("\tTrying to load driver", end="...") dll, path = load_driver(dlloader, candidates) - print('\tok') - print(f'\t\tLoaded from {path}') + print("\tok") + print(f"\t\tLoaded from {path}") except CudaSupportError as e: - print(f'\tERROR: failed to open driver: {e}') + print(f"\tERROR: failed to open driver: {e}") failed = True # Find the absolute location of the driver on Linux. Various driver-related @@ -127,9 +126,9 @@ def test(): # Providing the absolute location of the driver indicates its version # number in the soname (e.g. "libcuda.so.530.30.02"), which can be used to # look up whether the driver was intended for "native" Linux. - if sys.platform == 'linux' and not failed: + if sys.platform == "linux" and not failed: pid = os.getpid() - mapsfile = os.path.join(os.path.sep, 'proc', f'{pid}', 'maps') + mapsfile = os.path.join(os.path.sep, "proc", f"{pid}", "maps") try: with open(mapsfile) as f: maps = f.read() @@ -140,58 +139,61 @@ def test(): # It's helpful to report that this went wrong to the user, but we # don't set failed to True because this doesn't have any connection # to actual CUDA functionality. 
- print(f'\tERROR: Could not open {mapsfile} to determine absolute ' - 'path to libcuda.so') + print( + f"\tERROR: Could not open {mapsfile} to determine absolute " + "path to libcuda.so" + ) else: # In this case we could read the maps, so we can report the # relevant ones to the user - locations = set(s for s in maps.split() if 'libcuda.so' in s) - print('\tMapped libcuda.so paths:') + locations = set(s for s in maps.split() if "libcuda.so" in s) + print("\tMapped libcuda.so paths:") for location in locations: - print(f'\t\t{location}') + print(f"\t\t{location}") # Checks for dynamic libraries - libs = 'nvvm nvrtc cudart'.split() + libs = "nvvm nvrtc cudart".split() for lib in libs: path = get_cudalib(lib) - print('Finding {} from {}'.format(lib, _get_source_variable(lib))) - print('\tLocated at', path) + print("Finding {} from {}".format(lib, _get_source_variable(lib))) + print("\tLocated at", path) try: - print('\tTrying to open library', end='...') + print("\tTrying to open library", end="...") open_cudalib(lib) - print('\tok') + print("\tok") except OSError as e: - print('\tERROR: failed to open %s:\n%s' % (lib, e)) + print("\tERROR: failed to open %s:\n%s" % (lib, e)) failed = True # Check for cudadevrt (the only static library) - lib = 'cudadevrt' + lib = "cudadevrt" path = get_cudalib(lib, static=True) - print('Finding {} from {}'.format(lib, _get_source_variable(lib, - static=True))) - print('\tLocated at', path) + print( + "Finding {} from {}".format(lib, _get_source_variable(lib, static=True)) + ) + print("\tLocated at", path) try: - print('\tChecking library', end='...') + print("\tChecking library", end="...") check_static_lib(path) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find %s:\n%s' % (lib, e)) + print("\tERROR: failed to find %s:\n%s" % (lib, e)) failed = True # Check for libdevice - where = _get_source_variable('libdevice') - print(f'Finding libdevice from {where}') + where = 
_get_source_variable("libdevice") + print(f"Finding libdevice from {where}") path = get_libdevice() - print('\tLocated at', path) + print("\tLocated at", path) try: - print('\tChecking library', end='...') + print("\tChecking library", end="...") check_static_lib(path) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find %s:\n%s' % (lib, e)) + print("\tERROR: failed to find %s:\n%s" % (lib, e)) failed = True # Check cuda include paths @@ -199,16 +201,16 @@ def test(): print("Include directory configuration variable:") print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}") - where = _get_source_variable('include_dir') - print(f'Finding include directory from {where}') + where = _get_source_variable("include_dir") + print(f"Finding include directory from {where}") include = get_cuda_include_dir() - print('\tLocated at', include) + print("\tLocated at", include) try: - print('\tChecking include directory', end='...') + print("\tChecking include directory", end="...") check_cuda_include_dir(include) - print('\tok') + print("\tok") except FileNotFoundError as e: - print('\tERROR: failed to find cuda include directory:\n%s' % e) + print("\tERROR: failed to find cuda include directory:\n%s" % e) failed = True return not failed diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 95d369efd..aa94d22e9 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -1,24 +1,26 @@ from numba import config from . 
import enums + if config.CUDA_USE_NVIDIA_BINDING: from cuda import cuda + jitty = cuda.CUjitInputType FILE_EXTENSION_MAP = { - 'o': jitty.CU_JIT_INPUT_OBJECT, - 'ptx': jitty.CU_JIT_INPUT_PTX, - 'a': jitty.CU_JIT_INPUT_LIBRARY, - 'lib': jitty.CU_JIT_INPUT_LIBRARY, - 'cubin': jitty.CU_JIT_INPUT_CUBIN, - 'fatbin': jitty.CU_JIT_INPUT_FATBINARY, - 'ltoir': jitty.CU_JIT_INPUT_NVVM, + "o": jitty.CU_JIT_INPUT_OBJECT, + "ptx": jitty.CU_JIT_INPUT_PTX, + "a": jitty.CU_JIT_INPUT_LIBRARY, + "lib": jitty.CU_JIT_INPUT_LIBRARY, + "cubin": jitty.CU_JIT_INPUT_CUBIN, + "fatbin": jitty.CU_JIT_INPUT_FATBINARY, + "ltoir": jitty.CU_JIT_INPUT_NVVM, } else: FILE_EXTENSION_MAP = { - 'o': enums.CU_JIT_INPUT_OBJECT, - 'ptx': enums.CU_JIT_INPUT_PTX, - 'a': enums.CU_JIT_INPUT_LIBRARY, - 'lib': enums.CU_JIT_INPUT_LIBRARY, - 'cubin': enums.CU_JIT_INPUT_CUBIN, - 'fatbin': enums.CU_JIT_INPUT_FATBINARY, - 'ltoir': enums.CU_JIT_INPUT_NVVM, + "o": enums.CU_JIT_INPUT_OBJECT, + "ptx": enums.CU_JIT_INPUT_PTX, + "a": enums.CU_JIT_INPUT_LIBRARY, + "lib": enums.CU_JIT_INPUT_LIBRARY, + "cubin": enums.CU_JIT_INPUT_CUBIN, + "fatbin": enums.CU_JIT_INPUT_FATBINARY, + "ltoir": enums.CU_JIT_INPUT_NVVM, } diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 5ab970c02..145873848 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -1,7 +1,10 @@ from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER from enum import IntEnum -from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError, - NvrtcSupportError) +from numba.cuda.cudadrv.error import ( + NvrtcError, + NvrtcCompilationError, + NvrtcSupportError, +) from numba.cuda.cuda_paths import get_cuda_paths import functools import os @@ -39,6 +42,7 @@ class NvrtcProgram: the class own an nvrtcProgram; when an instance is deleted, the underlying nvrtcProgram is destroyed using the appropriate NVRTC API. 
""" + def __init__(self, nvrtc, handle): self._nvrtc = nvrtc self._handle = handle @@ -66,42 +70,56 @@ class NVRTC: # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet); "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto); - "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p) + "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p), } _PROTOTYPES = { # nvrtcResult nvrtcVersion(int *major, int *minor) - 'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)), + "nvrtcVersion": (nvrtc_result, POINTER(c_int), POINTER(c_int)), # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, # const char *src, # const char *name, # int numHeaders, # const char * const *headers, # const char * const *includeNames) - 'nvrtcCreateProgram': (nvrtc_result, nvrtc_program, c_char_p, c_char_p, - c_int, POINTER(c_char_p), POINTER(c_char_p)), + "nvrtcCreateProgram": ( + nvrtc_result, + nvrtc_program, + c_char_p, + c_char_p, + c_int, + POINTER(c_char_p), + POINTER(c_char_p), + ), # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); - 'nvrtcDestroyProgram': (nvrtc_result, POINTER(nvrtc_program)), + "nvrtcDestroyProgram": (nvrtc_result, POINTER(nvrtc_program)), # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, # int numOptions, # const char * const *options) - 'nvrtcCompileProgram': (nvrtc_result, nvrtc_program, c_int, - POINTER(c_char_p)), + "nvrtcCompileProgram": ( + nvrtc_result, + nvrtc_program, + c_int, + POINTER(c_char_p), + ), # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - 'nvrtcGetPTXSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)), + "nvrtcGetPTXSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - 'nvrtcGetPTX': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetPTX": (nvrtc_result, nvrtc_program, c_char_p), # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, # size_t 
*cubinSizeRet); - 'nvrtcGetCUBINSize': (nvrtc_result, nvrtc_program, POINTER(c_size_t)), + "nvrtcGetCUBINSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin); - 'nvrtcGetCUBIN': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetCUBIN": (nvrtc_result, nvrtc_program, c_char_p), # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, # size_t *logSizeRet); - 'nvrtcGetProgramLogSize': (nvrtc_result, nvrtc_program, - POINTER(c_size_t)), + "nvrtcGetProgramLogSize": ( + nvrtc_result, + nvrtc_program, + POINTER(c_size_t), + ), # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - 'nvrtcGetProgramLog': (nvrtc_result, nvrtc_program, c_char_p), + "nvrtcGetProgramLog": (nvrtc_result, nvrtc_program, c_char_p), } # Singleton reference @@ -111,14 +129,16 @@ def __new__(cls): with _nvrtc_lock: if cls.__INSTANCE is None: from numba.cuda.cudadrv.libs import open_cudalib + cls.__INSTANCE = inst = object.__new__(cls) try: - lib = open_cudalib('nvrtc') + lib = open_cudalib("nvrtc") except OSError as e: cls.__INSTANCE = None raise NvrtcSupportError("NVRTC cannot be loaded") from e from numba.cuda.cudadrv.runtime import get_version + if get_version() >= (12, 0): inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES @@ -137,9 +157,11 @@ def checked_call(*args, func=func, name=name): try: error_name = NvrtcResult(error).name except ValueError: - error_name = ('Unknown nvrtc_result ' - f'(error code: {error})') - msg = f'Failed to call {name}: {error_name}' + error_name = ( + "Unknown nvrtc_result " + f"(error code: {error})" + ) + msg = f"Failed to call {name}: {error_name}" raise NvrtcError(msg) setattr(inst, name, checked_call) @@ -182,7 +204,7 @@ def compile_program(self, program, options): # prior to the call to nvrtcCompileProgram encoded_options = [opt.encode() for opt in options] option_pointers = [c_char_p(opt) for opt in encoded_options] - c_options_type = (c_char_p * len(options)) + c_options_type = c_char_p * 
len(options) c_options = c_options_type(*option_pointers) try: self.nvrtcCompileProgram(program.handle, len(options), c_options) @@ -257,7 +279,7 @@ def compile(src, name, cc, ltoir=False): # - Relocatable Device Code (rdc) is needed to prevent device functions # being optimized away. major, minor = cc - arch = f'--gpu-architecture=compute_{major}{minor}' + arch = f"--gpu-architecture=compute_{major}{minor}" cuda_include = [ f"-I{get_cuda_paths()['include_dir'].info}", @@ -265,12 +287,12 @@ def compile(src, name, cc, ltoir=False): cudadrv_path = os.path.dirname(os.path.abspath(__file__)) numba_cuda_path = os.path.dirname(cudadrv_path) - numba_include = f'-I{numba_cuda_path}' + numba_include = f"-I{numba_cuda_path}" nrt_path = os.path.join(numba_cuda_path, "runtime") - nrt_include = f'-I{nrt_path}' + nrt_include = f"-I{nrt_path}" - options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true'] + options = [arch, *cuda_include, numba_include, nrt_include, "-rdc", "true"] if ltoir: options.append("-dlto") @@ -286,12 +308,12 @@ def compile(src, name, cc, ltoir=False): # If the compile failed, provide the log in an exception if compile_error: - msg = (f'NVRTC Compilation failure whilst compiling {name}:\n\n{log}') + msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}" raise NvrtcError(msg) # Otherwise, if there's any content in the log, present it as a warning if log: - msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}") + msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" warnings.warn(msg) if ltoir: diff --git a/numba_cuda/numba/cuda/cudadrv/nvvm.py b/numba_cuda/numba/cuda/cudadrv/nvvm.py index 0844661e2..b46fb0a39 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvvm.py +++ b/numba_cuda/numba/cuda/cudadrv/nvvm.py @@ -1,12 +1,12 @@ """ This is a direct translation of nvvm.h """ + import logging import re import sys import warnings -from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref, - c_char) +from 
ctypes import c_void_p, c_int, POINTER, c_char_p, c_size_t, byref, c_char import threading @@ -31,7 +31,7 @@ # Result code nvvm_result = c_int -RESULT_CODE_NAMES = ''' +RESULT_CODE_NAMES = """ NVVM_SUCCESS NVVM_ERROR_OUT_OF_MEMORY NVVM_ERROR_PROGRAM_CREATION_FAILURE @@ -42,19 +42,23 @@ NVVM_ERROR_INVALID_OPTION NVVM_ERROR_NO_MODULE_IN_PROGRAM NVVM_ERROR_COMPILATION -'''.split() +""".split() for i, k in enumerate(RESULT_CODE_NAMES): setattr(sys.modules[__name__], k, i) # Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support. -_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-' - 'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' - 'v64:64:64-v128:128:128-n16:32:64') -_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-' - 'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-' - 'v64:64:64-v128:128:128-n16:32:64') +_datalayout_original = ( + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" + "i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-" + "v64:64:64-v128:128:128-n16:32:64" +) +_datalayout_i128 = ( + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-" + "v64:64:64-v128:128:128-n16:32:64" +) def is_available(): @@ -73,59 +77,74 @@ def is_available(): class NVVM(object): - '''Process-wide singleton. 
- ''' - _PROTOTYPES = { + """Process-wide singleton.""" + _PROTOTYPES = { # nvvmResult nvvmVersion(int *major, int *minor) - 'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)), - + "nvvmVersion": (nvvm_result, POINTER(c_int), POINTER(c_int)), # nvvmResult nvvmCreateProgram(nvvmProgram *cu) - 'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)), - + "nvvmCreateProgram": (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmDestroyProgram(nvvmProgram *cu) - 'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)), - + "nvvmDestroyProgram": (nvvm_result, POINTER(nvvm_program)), # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer, # size_t size, const char *name) - 'nvvmAddModuleToProgram': ( - nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), - + "nvvmAddModuleToProgram": ( + nvvm_result, + nvvm_program, + c_char_p, + c_size_t, + c_char_p, + ), # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu, # const char* buffer, # size_t size, # const char *name) - 'nvvmLazyAddModuleToProgram': ( - nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p), - + "nvvmLazyAddModuleToProgram": ( + nvvm_result, + nvvm_program, + c_char_p, + c_size_t, + c_char_p, + ), # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions, # const char **options) - 'nvvmCompileProgram': ( - nvvm_result, nvvm_program, c_int, POINTER(c_char_p)), - + "nvvmCompileProgram": ( + nvvm_result, + nvvm_program, + c_int, + POINTER(c_char_p), + ), # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu, # size_t *bufferSizeRet) - 'nvvmGetCompiledResultSize': ( - nvvm_result, nvvm_program, POINTER(c_size_t)), - + "nvvmGetCompiledResultSize": ( + nvvm_result, + nvvm_program, + POINTER(c_size_t), + ), # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer) - 'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p), - + "nvvmGetCompiledResult": (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu, # size_t 
*bufferSizeRet) - 'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)), - + "nvvmGetProgramLogSize": (nvvm_result, nvvm_program, POINTER(c_size_t)), # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer) - 'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p), - + "nvvmGetProgramLog": (nvvm_result, nvvm_program, c_char_p), # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg, # int* minorDbg ) - 'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int), - POINTER(c_int), POINTER(c_int)), + "nvvmIRVersion": ( + nvvm_result, + POINTER(c_int), + POINTER(c_int), + POINTER(c_int), + POINTER(c_int), + ), # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions, # const char** options) - 'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int, - POINTER(c_char_p)) + "nvvmVerifyProgram": ( + nvvm_result, + nvvm_program, + c_int, + POINTER(c_char_p), + ), } # Singleton reference @@ -136,11 +155,13 @@ def __new__(cls): if cls.__INSTANCE is None: cls.__INSTANCE = inst = object.__new__(cls) try: - inst.driver = open_cudalib('nvvm') + inst.driver = open_cudalib("nvvm") except OSError as e: cls.__INSTANCE = None - errmsg = ("libNVVM cannot be found. Do `conda install " - "cudatoolkit`:\n%s") + errmsg = ( + "libNVVM cannot be found. 
Do `conda install " + "cudatoolkit`:\n%s" + ) raise NvvmSupportError(errmsg % e) # Find & populate functions @@ -175,7 +196,7 @@ def get_version(self): major = c_int() minor = c_int() err = self.nvvmVersion(byref(major), byref(minor)) - self.check_error(err, 'Failed to get version.') + self.check_error(err, "Failed to get version.") return major.value, minor.value def get_ir_version(self): @@ -183,9 +204,10 @@ def get_ir_version(self): minorIR = c_int() majorDbg = c_int() minorDbg = c_int() - err = self.nvvmIRVersion(byref(majorIR), byref(minorIR), - byref(majorDbg), byref(minorDbg)) - self.check_error(err, 'Failed to get IR version.') + err = self.nvvmIRVersion( + byref(majorIR), byref(minorIR), byref(majorDbg), byref(minorDbg) + ) + self.check_error(err, "Failed to get IR version.") return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value def check_error(self, error, msg, exit=False): @@ -223,18 +245,18 @@ def __init__(self, options): self.driver = NVVM() self._handle = nvvm_program() err = self.driver.nvvmCreateProgram(byref(self._handle)) - self.driver.check_error(err, 'Failed to create CU') + self.driver.check_error(err, "Failed to create CU") def stringify_option(k, v): - k = k.replace('_', '-') + k = k.replace("_", "-") if v is None: - return f'-{k}'.encode('utf-8') + return f"-{k}".encode("utf-8") if isinstance(v, bool): v = int(v) - return f'-{k}={v}'.encode('utf-8') + return f"-{k}={v}".encode("utf-8") options = [stringify_option(k, v) for k, v in options.items()] option_ptrs = (c_char_p * len(options))(*[c_char_p(x) for x in options]) @@ -248,17 +270,18 @@ def stringify_option(k, v): def __del__(self): driver = NVVM() err = driver.nvvmDestroyProgram(byref(self._handle)) - driver.check_error(err, 'Failed to destroy CU', exit=True) + driver.check_error(err, "Failed to destroy CU", exit=True) def add_module(self, buffer): """ - Add a module level NVVM IR to a compilation unit. 
- - The buffer should contain an NVVM module IR either in the bitcode - representation (LLVM3.0) or in the text representation. + Add a module level NVVM IR to a compilation unit. + - The buffer should contain an NVVM module IR either in the bitcode + representation (LLVM3.0) or in the text representation. """ - err = self.driver.nvvmAddModuleToProgram(self._handle, buffer, - len(buffer), None) - self.driver.check_error(err, 'Failed to add module') + err = self.driver.nvvmAddModuleToProgram( + self._handle, buffer, len(buffer), None + ) + self.driver.check_error(err, "Failed to add module") def lazy_add_module(self, buffer): """ @@ -266,37 +289,41 @@ def lazy_add_module(self, buffer): The buffer should contain NVVM module IR either in the bitcode representation or in the text representation. """ - err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer, - len(buffer), None) - self.driver.check_error(err, 'Failed to add module') + err = self.driver.nvvmLazyAddModuleToProgram( + self._handle, buffer, len(buffer), None + ) + self.driver.check_error(err, "Failed to add module") def verify(self): """ Run the NVVM verifier on all code added to the compilation unit. """ - err = self.driver.nvvmVerifyProgram(self._handle, self.n_options, - self.option_ptrs) - self._try_error(err, 'Failed to verify\n') + err = self.driver.nvvmVerifyProgram( + self._handle, self.n_options, self.option_ptrs + ) + self._try_error(err, "Failed to verify\n") def compile(self): """ Compile all modules added to the compilation unit and return the resulting PTX or LTO-IR (depending on the options). 
""" - err = self.driver.nvvmCompileProgram(self._handle, self.n_options, - self.option_ptrs) - self._try_error(err, 'Failed to compile\n') + err = self.driver.nvvmCompileProgram( + self._handle, self.n_options, self.option_ptrs + ) + self._try_error(err, "Failed to compile\n") # Get result result_size = c_size_t() - err = self.driver.nvvmGetCompiledResultSize(self._handle, - byref(result_size)) + err = self.driver.nvvmGetCompiledResultSize( + self._handle, byref(result_size) + ) - self._try_error(err, 'Failed to get size of compiled result.') + self._try_error(err, "Failed to get size of compiled result.") output_buffer = (c_char * result_size.value)() err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer) - self._try_error(err, 'Failed to get compiled result.') + self._try_error(err, "Failed to get compiled result.") # Get log self.log = self.get_log() @@ -311,26 +338,37 @@ def _try_error(self, err, msg): def get_log(self): reslen = c_size_t() err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen)) - self.driver.check_error(err, 'Failed to get compilation log size.') + self.driver.check_error(err, "Failed to get compilation log size.") if reslen.value > 1: logbuf = (c_char * reslen.value)() err = self.driver.nvvmGetProgramLog(self._handle, logbuf) - self.driver.check_error(err, 'Failed to get compilation log.') + self.driver.check_error(err, "Failed to get compilation log.") - return logbuf.value.decode('utf8') # populate log attribute + return logbuf.value.decode("utf8") # populate log attribute - return '' + return "" COMPUTE_CAPABILITIES = ( - (3, 5), (3, 7), - (5, 0), (5, 2), (5, 3), - (6, 0), (6, 1), (6, 2), - (7, 0), (7, 2), (7, 5), - (8, 0), (8, 6), (8, 7), (8, 9), + (3, 5), + (3, 7), + (5, 0), + (5, 2), + (5, 3), + (6, 0), + (6, 1), + (6, 2), + (7, 0), + (7, 2), + (7, 5), + (8, 0), + (8, 6), + (8, 7), + (8, 9), (9, 0), - (10, 0), (10, 1), + (10, 0), + (10, 1), (12, 0), ) @@ -358,20 +396,27 @@ def 
ccs_supported_by_ctk(ctk_version): try: # For supported versions, we look up the range of supported CCs min_cc, max_cc = CTK_SUPPORTED[ctk_version] - return tuple([cc for cc in COMPUTE_CAPABILITIES - if min_cc <= cc <= max_cc]) + return tuple( + [cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc] + ) except KeyError: # For unsupported CUDA toolkit versions, all we can do is assume all # non-deprecated versions we are aware of are supported. - return tuple([cc for cc in COMPUTE_CAPABILITIES - if cc >= config.CUDA_DEFAULT_PTX_CC]) + return tuple( + [ + cc + for cc in COMPUTE_CAPABILITIES + if cc >= config.CUDA_DEFAULT_PTX_CC + ] + ) def get_supported_ccs(): try: from numba.cuda.cudadrv.runtime import runtime + cudart_version = runtime.get_version() - except: # noqa: E722 + except: # noqa: E722 # We can't support anything if there's an error getting the runtime # version (e.g. if it's not present or there's another issue) _supported_cc = () @@ -382,9 +427,11 @@ def get_supported_ccs(): if cudart_version < min_cudart: _supported_cc = () ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}" - unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - " - f"{min_cudart[0]}.{min_cudart[1]} is the minimum " - "required version.") + unsupported_ver = ( + f"CUDA Toolkit {ctk_ver} is unsupported by Numba - " + f"{min_cudart[0]}.{min_cudart[1]} is the minimum " + "required version." + ) warnings.warn(unsupported_ver) return _supported_cc @@ -403,8 +450,10 @@ def find_closest_arch(mycc): supported_ccs = NVVM().supported_ccs if not supported_ccs: - msg = "No supported GPU compute capabilities found. " \ - "Please check your cudatoolkit version matches your CUDA version." + msg = ( + "No supported GPU compute capabilities found. " + "Please check your cudatoolkit version matches your CUDA version." 
+ ) raise NvvmSupportError(msg) for i, cc in enumerate(supported_ccs): @@ -415,8 +464,10 @@ def find_closest_arch(mycc): # Exceeded if i == 0: # CC lower than supported - msg = "GPU compute capability %d.%d is not supported" \ - "(requires >=%d.%d)" % (mycc + cc) + msg = ( + "GPU compute capability %d.%d is not supported" + "(requires >=%d.%d)" % (mycc + cc) + ) raise NvvmSupportError(msg) else: # return the previous CC @@ -427,16 +478,15 @@ def find_closest_arch(mycc): def get_arch_option(major, minor): - """Matches with the closest architecture option - """ + """Matches with the closest architecture option""" if config.FORCE_CUDA_CC: arch = config.FORCE_CUDA_CC else: arch = find_closest_arch((major, minor)) - return 'compute_%d%d' % arch + return "compute_%d%d" % arch -MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file. +MISSING_LIBDEVICE_FILE_MSG = """Missing libdevice file. Please ensure you have a CUDA Toolkit 11.2 or higher. For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required: @@ -445,7 +495,7 @@ def get_arch_option(major, minor): For CUDA 11, ``cudatoolkit`` is required: $ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0" -''' +""" class LibDevice(object): @@ -466,7 +516,7 @@ def get(self): cas_nvvm = """ %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0 -""" # noqa: E501 +""" # noqa: E501 # Translation of code from CUDA Programming Guide v6.5, section B.12 @@ -490,7 +540,7 @@ def get(self): %result = bitcast {Ti} %old to {T} ret {T} %result }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_inc_template = """ define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{ @@ -510,7 +560,7 @@ def get(self): done: ret {T} %old }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_dec_template = """ define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{ @@ -530,7 +580,7 @@ def get(self): done: 
ret {T} %old }} -""" # noqa: E501 +""" # noqa: E501 ir_numba_atomic_minmax_template = """ define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{ @@ -561,7 +611,7 @@ def get(self): done: ret {T} %ptrval }} -""" # noqa: E501 +""" # noqa: E501 def ir_cas(Ti): @@ -574,8 +624,15 @@ def ir_numba_atomic_binary(T, Ti, OP, FUNC): def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC): - params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, - FUNC=FUNC, CAS=ir_cas(Ti)) + params = dict( + T=T, + Ti=Ti, + NAN=NAN, + OP=OP, + PTR_OR_VAL=PTR_OR_VAL, + FUNC=FUNC, + CAS=ir_cas(Ti), + ) return ir_numba_atomic_minmax_template.format(**params) @@ -590,41 +647,115 @@ def ir_numba_atomic_dec(T, Tu): def llvm_replace(llvmir): replacements = [ - ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')), - ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')), - ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')), - ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")', - ir_numba_atomic_inc(T='i64', Tu='u64')), - ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")', - ir_numba_atomic_dec(T='i64', Tu='u64')), - ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt', - PTR_OR_VAL='ptr', FUNC='max')), - ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt', - PTR_OR_VAL='ptr', FUNC='max')), - ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501 - 
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt', - PTR_OR_VAL='ptr', FUNC='min')), - ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt', - PTR_OR_VAL='ptr', FUNC='min')), - ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult', - PTR_OR_VAL='', FUNC='max')), - ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult', - PTR_OR_VAL='', FUNC='max')), - ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt', - PTR_OR_VAL='', FUNC='min')), - ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501 - ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt', - PTR_OR_VAL='', FUNC='min')), - ('immarg', '') + ( + 'declare double @"___numba_atomic_double_add"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_binary(T="double", Ti="i64", OP="fadd", FUNC="add"), + ), + ( + 'declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_binary(T="float", Ti="i32", OP="fsub", FUNC="sub"), + ), + ( + 'declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_binary(T="double", Ti="i64", OP="fsub", FUNC="sub"), + ), + ( + 'declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")', + ir_numba_atomic_inc(T="i64", Tu="u64"), + ), + ( + 'declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")', + ir_numba_atomic_dec(T="i64", Tu="u64"), + ), + ( + 'declare float @"___numba_atomic_float_max"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="", + OP="nnan 
olt", + PTR_OR_VAL="ptr", + FUNC="max", + ), + ), + ( + 'declare double @"___numba_atomic_double_max"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="", + OP="nnan olt", + PTR_OR_VAL="ptr", + FUNC="max", + ), + ), + ( + 'declare float @"___numba_atomic_float_min"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="", + OP="nnan ogt", + PTR_OR_VAL="ptr", + FUNC="min", + ), + ), + ( + 'declare double @"___numba_atomic_double_min"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="", + OP="nnan ogt", + PTR_OR_VAL="ptr", + FUNC="min", + ), + ), + ( + 'declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="nan", + OP="ult", + PTR_OR_VAL="", + FUNC="max", + ), + ), + ( + 'declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="nan", + OP="ult", + PTR_OR_VAL="", + FUNC="max", + ), + ), + ( + 'declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="float", + Ti="i32", + NAN="nan", + OP="ugt", + PTR_OR_VAL="", + FUNC="min", + ), + ), + ( + 'declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")', # noqa: E501 + ir_numba_atomic_minmax( + T="double", + Ti="i64", + NAN="nan", + OP="ugt", + PTR_OR_VAL="", + FUNC="min", + ), + ), + ("immarg", ""), ] for decl, fn in replacements: @@ -639,19 +770,21 @@ def compile_ir(llvmir, **options): if isinstance(llvmir, str): llvmir = [llvmir] - if options.pop('fastmath', False): - options.update({ - 'ftz': True, - 'fma': True, - 'prec_div': False, - 'prec_sqrt': False, - }) + if options.pop("fastmath", False): + options.update( + { + "ftz": True, + "fma": True, + "prec_div": False, + "prec_sqrt": False, + } + ) cu = 
CompilationUnit(options) for mod in llvmir: mod = llvm_replace(mod) - cu.add_module(mod.encode('utf8')) + cu.add_module(mod.encode("utf8")) cu.verify() # We add libdevice following verification so that it is not subject to the @@ -671,16 +804,16 @@ def llvm150_to_70_ir(ir): """ buf = [] for line in ir.splitlines(): - if line.startswith('attributes #'): + if line.startswith("attributes #"): # Remove function attributes unsupported by LLVM 7.0 m = re_attributes_def.match(line) attrs = m.group(1).split() - attrs = ' '.join(a for a in attrs if a != 'willreturn') + attrs = " ".join(a for a in attrs if a != "willreturn") line = line.replace(m.group(1), attrs) buf.append(line) - return '\n'.join(buf) + return "\n".join(buf) def set_cuda_kernel(function): @@ -704,7 +837,7 @@ def set_cuda_kernel(function): mdvalue = ir.Constant(ir.IntType(32), 1) md = module.add_metadata((function, mdstr, mdvalue)) - nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations') + nmd = cgutils.get_or_insert_named_metadata(module, "nvvm.annotations") nmd.add(md) # Create the used list @@ -713,13 +846,13 @@ def set_cuda_kernel(function): fnptr = function.bitcast(ptrty) - llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used') - llvm_used.linkage = 'appending' - llvm_used.section = 'llvm.metadata' + llvm_used = ir.GlobalVariable(module, usedty, "llvm.used") + llvm_used.linkage = "appending" + llvm_used.section = "llvm.metadata" llvm_used.initializer = ir.Constant(usedty, [fnptr]) # Remove 'noinline' if it is present. 
- function.attributes.discard('noinline') + function.attributes.discard("noinline") def add_ir_version(mod): @@ -728,4 +861,4 @@ def add_ir_version(mod): i32 = ir.IntType(32) ir_versions = [i32(v) for v in NVVM().get_ir_version()] md_ver = mod.add_metadata(ir_versions) - mod.add_named_metadata('nvvmir.version', md_ver) + mod.add_named_metadata("nvvmir.version", md_ver) diff --git a/numba_cuda/numba/cuda/cudadrv/rtapi.py b/numba_cuda/numba/cuda/cudadrv/rtapi.py index 4a88457f9..4d30f5c63 100644 --- a/numba_cuda/numba/cuda/cudadrv/rtapi.py +++ b/numba_cuda/numba/cuda/cudadrv/rtapi.py @@ -6,5 +6,5 @@ API_PROTOTYPES = { # cudaError_t cudaRuntimeGetVersion ( int* runtimeVersion ) - 'cudaRuntimeGetVersion': (c_int, POINTER(c_int)), + "cudaRuntimeGetVersion": (c_int, POINTER(c_int)), } diff --git a/numba_cuda/numba/cuda/cudadrv/runtime.py b/numba_cuda/numba/cuda/cudadrv/runtime.py index 20634d8f4..d665f4db1 100644 --- a/numba_cuda/numba/cuda/cudadrv/runtime.py +++ b/numba_cuda/numba/cuda/cudadrv/runtime.py @@ -21,6 +21,7 @@ class CudaRuntimeAPIError(CudaRuntimeError): """ Raised when there is an error accessing a C API from the CUDA Runtime. """ + def __init__(self, code, msg): self.code = code self.msg = msg @@ -44,11 +45,13 @@ def _initialize(self): _logger = make_logger() if config.DISABLE_CUDA: - msg = ("CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " - "in the environment, or because CUDA is unsupported on " - "32-bit systems.") + msg = ( + "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1 " + "in the environment, or because CUDA is unsupported on " + "32-bit systems." 
+ ) raise CudaSupportError(msg) - self.lib = open_cudalib('cudart') + self.lib = open_cudalib("cudart") self.is_initialized = True @@ -76,9 +79,10 @@ def __getattr__(self, fname): def _wrap_api_call(self, fname, libfn): @functools.wraps(libfn) def safe_cuda_api_call(*args): - _logger.debug('call runtime api: %s', libfn.__name__) + _logger.debug("call runtime api: %s", libfn.__name__) retcode = libfn(*args) self._check_error(fname, retcode) + return safe_cuda_api_call def _check_error(self, fname, retcode): @@ -125,11 +129,19 @@ def is_supported_version(self): def supported_versions(self): """A tuple of all supported CUDA toolkit versions. Versions are given in the form ``(major_version, minor_version)``.""" - if sys.platform not in ('linux', 'win32') or config.MACHINE_BITS != 64: + if sys.platform not in ("linux", "win32") or config.MACHINE_BITS != 64: # Only 64-bit Linux and Windows are supported return () - return ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), (11, 6), - (11, 7)) + return ( + (11, 0), + (11, 1), + (11, 2), + (11, 3), + (11, 4), + (11, 5), + (11, 6), + (11, 7), + ) runtime = Runtime() diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py index 0ec08298c..931c43e31 100644 --- a/numba_cuda/numba/cuda/cudaimpl.py +++ b/numba_cuda/numba/cuda/cudaimpl.py @@ -29,48 +29,49 @@ def initialize_dim3(builder, prefix): return cgutils.pack_struct(builder, (x, y, z)) -@lower_attr(types.Module(cuda), 'threadIdx') +@lower_attr(types.Module(cuda), "threadIdx") def cuda_threadIdx(context, builder, sig, args): - return initialize_dim3(builder, 'tid') + return initialize_dim3(builder, "tid") -@lower_attr(types.Module(cuda), 'blockDim') +@lower_attr(types.Module(cuda), "blockDim") def cuda_blockDim(context, builder, sig, args): - return initialize_dim3(builder, 'ntid') + return initialize_dim3(builder, "ntid") -@lower_attr(types.Module(cuda), 'blockIdx') +@lower_attr(types.Module(cuda), "blockIdx") def cuda_blockIdx(context, 
builder, sig, args): - return initialize_dim3(builder, 'ctaid') + return initialize_dim3(builder, "ctaid") -@lower_attr(types.Module(cuda), 'gridDim') +@lower_attr(types.Module(cuda), "gridDim") def cuda_gridDim(context, builder, sig, args): - return initialize_dim3(builder, 'nctaid') + return initialize_dim3(builder, "nctaid") -@lower_attr(types.Module(cuda), 'laneid') +@lower_attr(types.Module(cuda), "laneid") def cuda_laneid(context, builder, sig, args): - return nvvmutils.call_sreg(builder, 'laneid') + return nvvmutils.call_sreg(builder, "laneid") -@lower_attr(dim3, 'x') +@lower_attr(dim3, "x") def dim3_x(context, builder, sig, args): return builder.extract_value(args, 0) -@lower_attr(dim3, 'y') +@lower_attr(dim3, "y") def dim3_y(context, builder, sig, args): return builder.extract_value(args, 1) -@lower_attr(dim3, 'z') +@lower_attr(dim3, "z") def dim3_z(context, builder, sig, args): return builder.extract_value(args, 2) # ----------------------------------------------------------------------------- + @lower(cuda.const.array_like, types.Array) def cuda_const_array_like(context, builder, sig, args): # This is a no-op because CUDATargetContext.make_constant_array already @@ -95,48 +96,68 @@ def _get_unique_smem_id(name): def cuda_shared_array_integer(context, builder, sig, args): length = sig.args[0].literal_value dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=(length,), dtype=dtype, - symbol_name=_get_unique_smem_id('_cudapy_smem'), - addrspace=nvvm.ADDRSPACE_SHARED, - can_dynsized=True) + return _generic_array( + context, + builder, + shape=(length,), + dtype=dtype, + symbol_name=_get_unique_smem_id("_cudapy_smem"), + addrspace=nvvm.ADDRSPACE_SHARED, + can_dynsized=True, + ) @lower(cuda.shared.array, types.Tuple, types.Any) @lower(cuda.shared.array, types.UniTuple, types.Any) def cuda_shared_array_tuple(context, builder, sig, args): - shape = [ s.literal_value for s in sig.args[0] ] + shape = [s.literal_value for s in 
sig.args[0]] dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name=_get_unique_smem_id('_cudapy_smem'), - addrspace=nvvm.ADDRSPACE_SHARED, - can_dynsized=True) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name=_get_unique_smem_id("_cudapy_smem"), + addrspace=nvvm.ADDRSPACE_SHARED, + can_dynsized=True, + ) @lower(cuda.local.array, types.IntegerLiteral, types.Any) def cuda_local_array_integer(context, builder, sig, args): length = sig.args[0].literal_value dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=(length,), dtype=dtype, - symbol_name='_cudapy_lmem', - addrspace=nvvm.ADDRSPACE_LOCAL, - can_dynsized=False) + return _generic_array( + context, + builder, + shape=(length,), + dtype=dtype, + symbol_name="_cudapy_lmem", + addrspace=nvvm.ADDRSPACE_LOCAL, + can_dynsized=False, + ) @lower(cuda.local.array, types.Tuple, types.Any) @lower(cuda.local.array, types.UniTuple, types.Any) def ptx_lmem_alloc_array(context, builder, sig, args): - shape = [ s.literal_value for s in sig.args[0] ] + shape = [s.literal_value for s in sig.args[0]] dtype = parse_dtype(sig.args[1]) - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_cudapy_lmem', - addrspace=nvvm.ADDRSPACE_LOCAL, - can_dynsized=False) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name="_cudapy_lmem", + addrspace=nvvm.ADDRSPACE_LOCAL, + can_dynsized=False, + ) @lower(stubs.threadfence_block) def ptx_threadfence_block(context, builder, sig, args): assert not args - fname = 'llvm.nvvm.membar.cta' + fname = "llvm.nvvm.membar.cta" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -147,7 +168,7 @@ def ptx_threadfence_block(context, builder, sig, args): @lower(stubs.threadfence_system) def ptx_threadfence_system(context, builder, sig, args): assert 
not args - fname = 'llvm.nvvm.membar.sys' + fname = "llvm.nvvm.membar.sys" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -158,7 +179,7 @@ def ptx_threadfence_system(context, builder, sig, args): @lower(stubs.threadfence) def ptx_threadfence_device(context, builder, sig, args): assert not args - fname = 'llvm.nvvm.membar.gl' + fname = "llvm.nvvm.membar.gl" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -175,7 +196,7 @@ def ptx_syncwarp(context, builder, sig, args): @lower(stubs.syncwarp, types.i4) def ptx_syncwarp_mask(context, builder, sig, args): - fname = 'llvm.nvvm.bar.warp.sync' + fname = "llvm.nvvm.bar.warp.sync" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), (ir.IntType(32),)) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -183,14 +204,18 @@ def ptx_syncwarp_mask(context, builder, sig, args): return context.get_dummy_value() -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, - types.i4) -@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, - types.i4) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4 +) +@lower( + stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4 +) def ptx_shfl_sync_i32(context, builder, sig, args): """ The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic @@ -203,12 +228,17 @@ def ptx_shfl_sync_i32(context, builder, sig, args): value_type = sig.args[2] if value_type in types.real_domain: value 
= builder.bitcast(value, ir.IntType(value_type.bitwidth)) - fname = 'llvm.nvvm.shfl.sync.i32' + fname = "llvm.nvvm.shfl.sync.i32" lmod = builder.module fnty = ir.FunctionType( ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), - (ir.IntType(32), ir.IntType(32), ir.IntType(32), - ir.IntType(32), ir.IntType(32)) + ( + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ir.IntType(32), + ), ) func = cgutils.get_or_insert_function(lmod, fnty, fname) if value_type.bitwidth == 32: @@ -239,11 +269,12 @@ def ptx_shfl_sync_i32(context, builder, sig, args): @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean) def ptx_vote_sync(context, builder, sig, args): - fname = 'llvm.nvvm.vote.sync' + fname = "llvm.nvvm.vote.sync" lmod = builder.module - fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32), - ir.IntType(1))), - (ir.IntType(32), ir.IntType(32), ir.IntType(1))) + fnty = ir.FunctionType( + ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), + (ir.IntType(32), ir.IntType(32), ir.IntType(1)), + ) func = cgutils.get_or_insert_function(lmod, fnty, fname) return builder.call(func, args) @@ -257,7 +288,7 @@ def ptx_match_any_sync(context, builder, sig, args): width = sig.args[1].bitwidth if sig.args[1] in types.real_domain: value = builder.bitcast(value, ir.IntType(width)) - fname = 'llvm.nvvm.match.any.sync.i{}'.format(width) + fname = "llvm.nvvm.match.any.sync.i{}".format(width) lmod = builder.module fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32), ir.IntType(width))) func = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -273,27 +304,35 @@ def ptx_match_all_sync(context, builder, sig, args): width = sig.args[1].bitwidth if sig.args[1] in types.real_domain: value = builder.bitcast(value, ir.IntType(width)) - fname = 'llvm.nvvm.match.all.sync.i{}'.format(width) + fname = "llvm.nvvm.match.all.sync.i{}".format(width) lmod = builder.module - fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32), - ir.IntType(1))), 
- (ir.IntType(32), ir.IntType(width))) + fnty = ir.FunctionType( + ir.LiteralStructType((ir.IntType(32), ir.IntType(1))), + (ir.IntType(32), ir.IntType(width)), + ) func = cgutils.get_or_insert_function(lmod, fnty, fname) return builder.call(func, (mask, value)) @lower(stubs.activemask) def ptx_activemask(context, builder, sig, args): - activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "activemask.b32 $0;", '=r', side_effect=True) + activemask = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "activemask.b32 $0;", + "=r", + side_effect=True, + ) return builder.call(activemask, []) @lower(stubs.lanemask_lt) def ptx_lanemask_lt(context, builder, sig, args): - activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "mov.u32 $0, %lanemask_lt;", '=r', - side_effect=True) + activemask = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "mov.u32 $0, %lanemask_lt;", + "=r", + side_effect=True, + ) return builder.call(activemask, []) @@ -308,7 +347,7 @@ def ptx_fma(context, builder, sig, args): def float16_float_ty_constraint(bitwidth): - typemap = {32: ('f32', 'f'), 64: ('f64', 'd')} + typemap = {32: ("f32", "f"), 64: ("f64", "d")} try: return typemap[bitwidth] @@ -342,7 +381,7 @@ def float_to_float16_cast(context, builder, fromty, toty, val): def float16_int_constraint(bitwidth): - typemap = { 8: 'c', 16: 'h', 32: 'r', 64: 'l' } + typemap = {8: "c", 16: "h", 32: "r", 64: "l"} try: return typemap[bitwidth] @@ -355,12 +394,12 @@ def float16_int_constraint(bitwidth): def float16_to_integer_cast(context, builder, fromty, toty, val): bitwidth = toty.bitwidth constraint = float16_int_constraint(bitwidth) - signedness = 's' if toty.signed else 'u' + signedness = "s" if toty.signed else "u" fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, - f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", - f"={constraint},h") + asm = ir.InlineAsm( + fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", 
f"={constraint},h" + ) return builder.call(asm, [val]) @@ -369,40 +408,38 @@ def float16_to_integer_cast(context, builder, fromty, toty, val): def integer_to_float16_cast(context, builder, fromty, toty, val): bitwidth = fromty.bitwidth constraint = float16_int_constraint(bitwidth) - signedness = 's' if fromty.signed else 'u' + signedness = "s" if fromty.signed else "u" - fnty = ir.FunctionType(ir.IntType(16), - [context.get_value_type(fromty)]) - asm = ir.InlineAsm(fnty, - f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", - f"=h,{constraint}") + fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)]) + asm = ir.InlineAsm( + fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}" + ) return builder.call(asm, [val]) def lower_fp16_binary(fn, op): @lower(fn, types.float16, types.float16) def ptx_fp16_binary(context, builder, sig, args): - fnty = ir.FunctionType(ir.IntType(16), - [ir.IntType(16), ir.IntType(16)]) - asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h') + fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)]) + asm = ir.InlineAsm(fnty, f"{op}.f16 $0,$1,$2;", "=h,h,h") return builder.call(asm, args) -lower_fp16_binary(stubs.fp16.hadd, 'add') -lower_fp16_binary(operator.add, 'add') -lower_fp16_binary(operator.iadd, 'add') -lower_fp16_binary(stubs.fp16.hsub, 'sub') -lower_fp16_binary(operator.sub, 'sub') -lower_fp16_binary(operator.isub, 'sub') -lower_fp16_binary(stubs.fp16.hmul, 'mul') -lower_fp16_binary(operator.mul, 'mul') -lower_fp16_binary(operator.imul, 'mul') +lower_fp16_binary(stubs.fp16.hadd, "add") +lower_fp16_binary(operator.add, "add") +lower_fp16_binary(operator.iadd, "add") +lower_fp16_binary(stubs.fp16.hsub, "sub") +lower_fp16_binary(operator.sub, "sub") +lower_fp16_binary(operator.isub, "sub") +lower_fp16_binary(stubs.fp16.hmul, "mul") +lower_fp16_binary(operator.mul, "mul") +lower_fp16_binary(operator.imul, "mul") @lower(stubs.fp16.hneg, types.float16) def ptx_fp16_hneg(context, builder, 
sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, 'neg.f16 $0, $1;', '=h,h') + asm = ir.InlineAsm(fnty, "neg.f16 $0, $1;", "=h,h") return builder.call(asm, args) @@ -414,7 +451,7 @@ def operator_hneg(context, builder, sig, args): @lower(stubs.fp16.habs, types.float16) def ptx_fp16_habs(context, builder, sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)]) - asm = ir.InlineAsm(fnty, 'abs.f16 $0, $1;', '=h,h') + asm = ir.InlineAsm(fnty, "abs.f16 $0, $1;", "=h,h") return builder.call(asm, args) @@ -450,27 +487,28 @@ def fp16_div(x, y): def _gen_fp16_cmp(op): def ptx_fp16_comparison(context, builder, sig, args): fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)]) - asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), '=h,h,h') + asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), "=h,h,h") result = builder.call(asm, args) zero = context.get_constant(types.int16, 0) int_result = builder.bitcast(result, ir.IntType(16)) return builder.icmp_unsigned("!=", int_result, zero) + return ptx_fp16_comparison -lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp('eq')) -lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp('eq')) -lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp('ne')) -lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp('ne')) -lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp('ge')) -lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp('ge')) -lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp('gt')) -lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp('gt')) -lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp('le')) -lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp('le')) -lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp('lt')) -lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp('lt')) +lower(stubs.fp16.heq, types.float16, 
types.float16)(_gen_fp16_cmp("eq")) +lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp("eq")) +lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp("ne")) +lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp("ne")) +lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp("ge")) +lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp("ge")) +lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp("gt")) +lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp("gt")) +lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp("le")) +lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp("le")) +lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp("lt")) +lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp("lt")) def lower_fp16_minmax(fn, fname, op): @@ -480,8 +518,8 @@ def ptx_fp16_minmax(context, builder, sig, args): return builder.select(choice, args[0], args[1]) -lower_fp16_minmax(stubs.fp16.hmax, 'max', 'gt') -lower_fp16_minmax(stubs.fp16.hmin, 'min', 'lt') +lower_fp16_minmax(stubs.fp16.hmax, "max", "gt") +lower_fp16_minmax(stubs.fp16.hmin, "min", "lt") # See: # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrt.html#__nv_cbrt @@ -489,8 +527,8 @@ def ptx_fp16_minmax(context, builder, sig, args): cbrt_funcs = { - types.float32: '__nv_cbrtf', - types.float64: '__nv_cbrt', + types.float32: "__nv_cbrtf", + types.float64: "__nv_cbrt", } @@ -514,7 +552,8 @@ def ptx_brev_u4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(32),)), - '__nv_brev') + "__nv_brev", + ) return builder.call(fn, args) @@ -526,15 +565,14 @@ def ptx_brev_u8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(64), (ir.IntType(64),)), - '__nv_brevll') + "__nv_brevll", + ) return builder.call(fn, args) @lower(stubs.clz, types.Any) def 
ptx_clz(context, builder, sig, args): - return builder.ctlz( - args[0], - context.get_constant(types.boolean, 0)) + return builder.ctlz(args[0], context.get_constant(types.boolean, 0)) @lower(stubs.ffs, types.i4) @@ -543,7 +581,8 @@ def ptx_ffs_32(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(32),)), - '__nv_ffs') + "__nv_ffs", + ) return builder.call(fn, args) @@ -553,7 +592,8 @@ def ptx_ffs_64(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, ir.FunctionType(ir.IntType(32), (ir.IntType(64),)), - '__nv_ffsll') + "__nv_ffsll", + ) return builder.call(fn, args) @@ -567,10 +607,9 @@ def ptx_selp(context, builder, sig, args): def ptx_max_f4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.FloatType(), - (ir.FloatType(), ir.FloatType())), - '__nv_fmaxf') + ir.FunctionType(ir.FloatType(), (ir.FloatType(), ir.FloatType())), + "__nv_fmaxf", + ) return builder.call(fn, args) @@ -580,25 +619,26 @@ def ptx_max_f4(context, builder, sig, args): def ptx_max_f8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.DoubleType(), - (ir.DoubleType(), ir.DoubleType())), - '__nv_fmax') + ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())), + "__nv_fmax", + ) - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - context.cast(builder, args[1], sig.args[1], types.double), - ]) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + context.cast(builder, args[1], sig.args[1], types.double), + ], + ) @lower(min, types.f4, types.f4) def ptx_min_f4(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.FloatType(), - (ir.FloatType(), ir.FloatType())), - '__nv_fminf') + ir.FunctionType(ir.FloatType(), (ir.FloatType(), 
ir.FloatType())), + "__nv_fminf", + ) return builder.call(fn, args) @@ -608,15 +648,17 @@ def ptx_min_f4(context, builder, sig, args): def ptx_min_f8(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.DoubleType(), - (ir.DoubleType(), ir.DoubleType())), - '__nv_fmin') + ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())), + "__nv_fmin", + ) - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - context.cast(builder, args[1], sig.args[1], types.double), - ]) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + context.cast(builder, args[1], sig.args[1], types.double), + ], + ) @lower(round, types.f4) @@ -624,19 +666,22 @@ def ptx_min_f8(context, builder, sig, args): def ptx_round(context, builder, sig, args): fn = cgutils.get_or_insert_function( builder.module, - ir.FunctionType( - ir.IntType(64), - (ir.DoubleType(),)), - '__nv_llrint') - return builder.call(fn, [ - context.cast(builder, args[0], sig.args[0], types.double), - ]) + ir.FunctionType(ir.IntType(64), (ir.DoubleType(),)), + "__nv_llrint", + ) + return builder.call( + fn, + [ + context.cast(builder, args[0], sig.args[0], types.double), + ], + ) # This rounding implementation follows the algorithm used in the "fallback # version" of double_round in CPython. 
# https://github.com/python/cpython/blob/a755410e054e1e2390de5830befc08fe80706c66/Objects/floatobject.c#L964-L1007 + @lower(round, types.f4, types.Integer) @lower(round, types.f8, types.Integer) def round_to_impl(context, builder, sig, args): @@ -651,7 +696,7 @@ def round_ndigits(x, ndigits): pow1 = 10.0 ** (ndigits - 22) pow2 = 1e22 else: - pow1 = 10.0 ** ndigits + pow1 = 10.0**ndigits pow2 = 1.0 y = (x * pow1) * pow2 if math.isinf(y): @@ -662,7 +707,7 @@ def round_ndigits(x, ndigits): y = x / pow1 z = round(y) - if (math.fabs(y - z) == 0.5): + if math.fabs(y - z) == 0.5: # halfway between two integers; use round-half-even z = 2.0 * round(y / 2.0) @@ -673,19 +718,25 @@ def round_ndigits(x, ndigits): return z - return context.compile_internal(builder, round_ndigits, sig, args, ) + return context.compile_internal( + builder, + round_ndigits, + sig, + args, + ) def gen_deg_rad(const): def impl(context, builder, sig, args): - argty, = sig.args + (argty,) = sig.args factor = context.get_constant(argty, const) return builder.fmul(factor, args[0]) + return impl -_deg2rad = math.pi / 180. -_rad2deg = 180. 
/ math.pi +_deg2rad = math.pi / 180.0 +_rad2deg = 180.0 / math.pi lower(math.radians, types.f4)(gen_deg_rad(_deg2rad)) lower(math.radians, types.f8)(gen_deg_rad(_deg2rad)) lower(math.degrees, types.f4)(gen_deg_rad(_rad2deg)) @@ -701,16 +752,18 @@ def _normalize_indices(context, builder, indty, inds, aryty, valty): indices = [inds] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] dtype = aryty.dtype if dtype != valty: raise TypeError("expect %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) return indty, indices @@ -722,14 +775,17 @@ def imp(context, builder, sig, args): ary, inds, val = args dtype = aryty.dtype - indty, indices = _normalize_indices(context, builder, indty, inds, - aryty, valty) + indty, indices = _normalize_indices( + context, builder, indty, inds, aryty, valty + ) lary = context.make_array(aryty)(context, builder, ary) - ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices, - wraparound=True) + ptr = cgutils.get_item_pointer( + context, builder, aryty, lary, indices, wraparound=True + ) # dispatcher to implementation base on dtype return dispatch_fn(context, builder, dtype, ptr, val) + return imp @@ -740,14 +796,16 @@ def imp(context, builder, sig, args): def ptx_atomic_add_tuple(context, builder, dtype, ptr, val): if dtype == types.float32: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_add_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_add_float32(lmod), (ptr, val) + ) elif dtype == types.float64: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_add_float64(lmod), - (ptr, 
val)) + return builder.call( + nvvmutils.declare_atomic_add_float64(lmod), (ptr, val) + ) else: - return builder.atomic_rmw('add', ptr, val, 'monotonic') + return builder.atomic_rmw("add", ptr, val, "monotonic") @lower(stubs.atomic.sub, types.Array, types.intp, types.Any) @@ -757,14 +815,16 @@ def ptx_atomic_add_tuple(context, builder, dtype, ptr, val): def ptx_atomic_sub(context, builder, dtype, ptr, val): if dtype == types.float32: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_sub_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_sub_float32(lmod), (ptr, val) + ) elif dtype == types.float64: lmod = builder.module - return builder.call(nvvmutils.declare_atomic_sub_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_sub_float64(lmod), (ptr, val) + ) else: - return builder.atomic_rmw('sub', ptr, val, 'monotonic') + return builder.atomic_rmw("sub", ptr, val, "monotonic") @lower(stubs.atomic.inc, types.Array, types.intp, types.Any) @@ -775,10 +835,10 @@ def ptx_atomic_inc(context, builder, dtype, ptr, val): if dtype in cuda.cudadecl.unsigned_int_numba_types: bw = dtype.bitwidth lmod = builder.module - fn = getattr(nvvmutils, f'declare_atomic_inc_int{bw}') + fn = getattr(nvvmutils, f"declare_atomic_inc_int{bw}") return builder.call(fn(lmod), (ptr, val)) else: - raise TypeError(f'Unimplemented atomic inc with {dtype} array') + raise TypeError(f"Unimplemented atomic inc with {dtype} array") @lower(stubs.atomic.dec, types.Array, types.intp, types.Any) @@ -789,27 +849,27 @@ def ptx_atomic_dec(context, builder, dtype, ptr, val): if dtype in cuda.cudadecl.unsigned_int_numba_types: bw = dtype.bitwidth lmod = builder.module - fn = getattr(nvvmutils, f'declare_atomic_dec_int{bw}') + fn = getattr(nvvmutils, f"declare_atomic_dec_int{bw}") return builder.call(fn(lmod), (ptr, val)) else: - raise TypeError(f'Unimplemented atomic dec with {dtype} array') + raise TypeError(f"Unimplemented atomic dec with 
{dtype} array") def ptx_atomic_bitwise(stub, op): @_atomic_dispatcher def impl_ptx_atomic(context, builder, dtype, ptr, val): if dtype in (cuda.cudadecl.integer_numba_types): - return builder.atomic_rmw(op, ptr, val, 'monotonic') + return builder.atomic_rmw(op, ptr, val, "monotonic") else: - raise TypeError(f'Unimplemented atomic {op} with {dtype} array') + raise TypeError(f"Unimplemented atomic {op} with {dtype} array") for ty in (types.intp, types.UniTuple, types.Tuple): lower(stub, types.Array, ty, types.Any)(impl_ptx_atomic) -ptx_atomic_bitwise(stubs.atomic.and_, 'and') -ptx_atomic_bitwise(stubs.atomic.or_, 'or') -ptx_atomic_bitwise(stubs.atomic.xor, 'xor') +ptx_atomic_bitwise(stubs.atomic.and_, "and") +ptx_atomic_bitwise(stubs.atomic.or_, "or") +ptx_atomic_bitwise(stubs.atomic.xor, "xor") @lower(stubs.atomic.exch, types.Array, types.intp, types.Any) @@ -818,9 +878,9 @@ def impl_ptx_atomic(context, builder, dtype, ptr, val): @_atomic_dispatcher def ptx_atomic_exch(context, builder, dtype, ptr, val): if dtype in (cuda.cudadecl.integer_numba_types): - return builder.atomic_rmw('xchg', ptr, val, 'monotonic') + return builder.atomic_rmw("xchg", ptr, val, "monotonic") else: - raise TypeError(f'Unimplemented atomic exch with {dtype} array') + raise TypeError(f"Unimplemented atomic exch with {dtype} array") @lower(stubs.atomic.max, types.Array, types.intp, types.Any) @@ -830,17 +890,19 @@ def ptx_atomic_exch(context, builder, dtype, ptr, val): def ptx_atomic_max(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_max_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_max_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_max_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_max_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return 
builder.atomic_rmw('max', ptr, val, ordering='monotonic') + return builder.atomic_rmw("max", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umax', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umax", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic max with %s array' % dtype) + raise TypeError("Unimplemented atomic max with %s array" % dtype) @lower(stubs.atomic.min, types.Array, types.intp, types.Any) @@ -850,17 +912,19 @@ def ptx_atomic_max(context, builder, dtype, ptr, val): def ptx_atomic_min(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_min_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_min_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_min_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_min_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('min', ptr, val, ordering='monotonic') + return builder.atomic_rmw("min", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umin', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umin", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic min with %s array' % dtype) + raise TypeError("Unimplemented atomic min with %s array" % dtype) @lower(stubs.atomic.nanmax, types.Array, types.intp, types.Any) @@ -870,17 +934,19 @@ def ptx_atomic_min(context, builder, dtype, ptr, val): def ptx_atomic_nanmax(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_nanmax_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmax_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return 
builder.call(nvvmutils.declare_atomic_nanmax_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmax_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('max', ptr, val, ordering='monotonic') + return builder.atomic_rmw("max", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umax', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umax", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic max with %s array' % dtype) + raise TypeError("Unimplemented atomic max with %s array" % dtype) @lower(stubs.atomic.nanmin, types.Array, types.intp, types.Any) @@ -890,17 +956,19 @@ def ptx_atomic_nanmax(context, builder, dtype, ptr, val): def ptx_atomic_nanmin(context, builder, dtype, ptr, val): lmod = builder.module if dtype == types.float64: - return builder.call(nvvmutils.declare_atomic_nanmin_float64(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmin_float64(lmod), (ptr, val) + ) elif dtype == types.float32: - return builder.call(nvvmutils.declare_atomic_nanmin_float32(lmod), - (ptr, val)) + return builder.call( + nvvmutils.declare_atomic_nanmin_float32(lmod), (ptr, val) + ) elif dtype in (types.int32, types.int64): - return builder.atomic_rmw('min', ptr, val, ordering='monotonic') + return builder.atomic_rmw("min", ptr, val, ordering="monotonic") elif dtype in (types.uint32, types.uint64): - return builder.atomic_rmw('umin', ptr, val, ordering='monotonic') + return builder.atomic_rmw("umin", ptr, val, ordering="monotonic") else: - raise TypeError('Unimplemented atomic min with %s array' % dtype) + raise TypeError("Unimplemented atomic min with %s array" % dtype) @lower(stubs.atomic.compare_and_swap, types.Array, types.Any, types.Any) @@ -917,19 +985,21 @@ def ptx_atomic_cas(context, builder, sig, args): aryty, indty, oldty, valty = sig.args ary, inds, old, val = args - indty, 
indices = _normalize_indices(context, builder, indty, inds, aryty, - valty) + indty, indices = _normalize_indices( + context, builder, indty, inds, aryty, valty + ) lary = context.make_array(aryty)(context, builder, ary) - ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices, - wraparound=True) + ptr = cgutils.get_item_pointer( + context, builder, aryty, lary, indices, wraparound=True + ) if aryty.dtype in (cuda.cudadecl.integer_numba_types): lmod = builder.module bitwidth = aryty.dtype.bitwidth return nvvmutils.atomic_cmpxchg(builder, lmod, bitwidth, ptr, old, val) else: - raise TypeError('Unimplemented atomic cas with %s array' % aryty.dtype) + raise TypeError("Unimplemented atomic cas with %s array" % aryty.dtype) # ----------------------------------------------------------------------------- @@ -937,15 +1007,20 @@ def ptx_atomic_cas(context, builder, sig, args): @lower(breakpoint) def ptx_brkpt(context, builder, sig, args): - brkpt = ir.InlineAsm(ir.FunctionType(ir.VoidType(), []), - "brkpt;", '', side_effect=True) + brkpt = ir.InlineAsm( + ir.FunctionType(ir.VoidType(), []), "brkpt;", "", side_effect=True + ) builder.call(brkpt, ()) @lower(stubs.nanosleep, types.uint32) def ptx_nanosleep(context, builder, sig, args): - nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]), - "nanosleep.u32 $0;", 'r', side_effect=True) + nanosleep = ir.InlineAsm( + ir.FunctionType(ir.VoidType(), [ir.IntType(32)]), + "nanosleep.u32 $0;", + "r", + side_effect=True, + ) ns = args[0] builder.call(nanosleep, [ns]) @@ -953,8 +1028,9 @@ def ptx_nanosleep(context, builder, sig, args): # ----------------------------------------------------------------------------- -def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, - can_dynsized=False): +def _generic_array( + context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False +): elemcount = reduce(operator.mul, shape, 1) # Check for valid shape for this type of 
allocation. @@ -985,16 +1061,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, lmod = builder.module # Create global variable in the requested address space - gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name, - addrspace) + gvmem = cgutils.add_global_variable( + lmod, laryty, symbol_name, addrspace + ) # Specify alignment to avoid misalignment bug align = context.get_abi_sizeof(lldtype) # Alignment is required to be a power of 2 for shared memory. If it is # not a power of 2 (e.g. for a Record array) then round up accordingly. - gvmem.align = 1 << (align - 1 ).bit_length() + gvmem.align = 1 << (align - 1).bit_length() if dynamic_smem: - gvmem.linkage = 'external' + gvmem.linkage = "external" else: ## Comment out the following line to workaround a NVVM bug ## which generates a invalid symbol name when the linkage @@ -1005,8 +1082,9 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, gvmem.initializer = ir.Constant(laryty, ir.Undefined) # Convert to generic address-space - dataptr = builder.addrspacecast(gvmem, ir.PointerType(ir.IntType(8)), - 'generic') + dataptr = builder.addrspacecast( + gvmem, ir.PointerType(ir.IntType(8)), "generic" + ) targetdata = ll.create_target_data(nvvm.NVVM().data_layout) lldtype = context.get_data_type(dtype) @@ -1027,11 +1105,15 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, # Unfortunately NVVM does not provide an intrinsic for the # %dynamic_smem_size register, so we must read it using inline # assembly. 
- get_dynshared_size = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []), - "mov.u32 $0, %dynamic_smem_size;", - '=r', side_effect=True) - dynsmem_size = builder.zext(builder.call(get_dynshared_size, []), - ir.IntType(64)) + get_dynshared_size = ir.InlineAsm( + ir.FunctionType(ir.IntType(32), []), + "mov.u32 $0, %dynamic_smem_size;", + "=r", + side_effect=True, + ) + dynsmem_size = builder.zext( + builder.call(get_dynshared_size, []), ir.IntType(64) + ) # Only 1-D dynamic shared memory is supported so the following is a # sufficient construction of the shape kitemsize = context.get_constant(types.intp, itemsize) @@ -1041,15 +1123,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace, # Create array object ndim = len(shape) - aryty = types.Array(dtype=dtype, ndim=ndim, layout='C') + aryty = types.Array(dtype=dtype, ndim=ndim, layout="C") ary = context.make_array(aryty)(context, builder) - context.populate_array(ary, - data=builder.bitcast(dataptr, ary.data.type), - shape=kshape, - strides=kstrides, - itemsize=context.get_constant(types.intp, itemsize), - meminfo=None) + context.populate_array( + ary, + data=builder.bitcast(dataptr, ary.data.type), + shape=kshape, + strides=kstrides, + itemsize=context.get_constant(types.intp, itemsize), + meminfo=None, + ) return ary._getvalue() diff --git a/numba_cuda/numba/cuda/cudamath.py b/numba_cuda/numba/cuda/cudamath.py index 12d9715b6..f03c3b2ba 100644 --- a/numba_cuda/numba/cuda/cudamath.py +++ b/numba_cuda/numba/cuda/cudamath.py @@ -136,5 +136,5 @@ class Math_isnan(ConcreteTemplate): class Math_modf(ConcreteTemplate): cases = [ signature(types.UniTuple(types.float64, 2), types.float64), - signature(types.UniTuple(types.float32, 2), types.float32) + signature(types.UniTuple(types.float32, 2), types.float32), ] diff --git a/numba_cuda/numba/cuda/debuginfo.py b/numba_cuda/numba/cuda/debuginfo.py index 8b65c825b..2cfc5916d 100644 --- a/numba_cuda/numba/cuda/debuginfo.py +++ 
b/numba_cuda/numba/cuda/debuginfo.py @@ -7,7 +7,6 @@ class CUDADIBuilder(DIBuilder): - def _var_type(self, lltype, size, datamodel=None): is_bool = False is_grid_group = False @@ -34,11 +33,14 @@ def _var_type(self, lltype, size, datamodel=None): elif is_grid_group: ditok = "DW_ATE_unsigned" - return m.add_debug_info('DIBasicType', { - 'name': name, - 'size': bitsize, - 'encoding': ir.DIToken(ditok), - }) + return m.add_debug_info( + "DIBasicType", + { + "name": name, + "size": bitsize, + "encoding": ir.DIToken(ditok), + }, + ) # For other cases, use upstream Numba implementation return super()._var_type(lltype, size, datamodel=datamodel) diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py index db62fb96a..edc904f0d 100644 --- a/numba_cuda/numba/cuda/decorators.py +++ b/numba_cuda/numba/cuda/decorators.py @@ -6,13 +6,24 @@ from numba.cuda.simulator.kernel import FakeCUDAKernel -_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. " - "Signatures should be passed as the first " - "positional argument.") - - -def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, - opt=None, lineinfo=False, cache=False, **kws): +_msg_deprecated_signature_arg = ( + "Deprecated keyword argument `{0}`. " + "Signatures should be passed as the first " + "positional argument." +) + + +def jit( + func_or_sig=None, + device=False, + inline=False, + link=[], + debug=None, + opt=None, + lineinfo=False, + cache=False, + **kws, +): """ JIT compile a Python function for CUDA GPUs. 
@@ -55,39 +66,43 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, """ if link and config.ENABLE_CUDASIM: - raise NotImplementedError('Cannot link PTX in the simulator') + raise NotImplementedError("Cannot link PTX in the simulator") - if kws.get('boundscheck'): + if kws.get("boundscheck"): raise NotImplementedError("bounds checking is not supported for CUDA") - if kws.get('argtypes') is not None: - msg = _msg_deprecated_signature_arg.format('argtypes') + if kws.get("argtypes") is not None: + msg = _msg_deprecated_signature_arg.format("argtypes") raise DeprecationError(msg) - if kws.get('restype') is not None: - msg = _msg_deprecated_signature_arg.format('restype') + if kws.get("restype") is not None: + msg = _msg_deprecated_signature_arg.format("restype") raise DeprecationError(msg) - if kws.get('bind') is not None: - msg = _msg_deprecated_signature_arg.format('bind') + if kws.get("bind") is not None: + msg = _msg_deprecated_signature_arg.format("bind") raise DeprecationError(msg) debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug opt = (config.OPT != 0) if opt is None else opt - fastmath = kws.get('fastmath', False) - extensions = kws.get('extensions', []) + fastmath = kws.get("fastmath", False) + extensions = kws.get("extensions", []) if debug and opt: - msg = ("debug=True with opt=True " - "is not supported by CUDA. This may result in a crash" - " - set debug=False or opt=False.") + msg = ( + "debug=True with opt=True " + "is not supported by CUDA. This may result in a crash" + " - set debug=False or opt=False." + ) warn(NumbaInvalidConfigWarning(msg)) if debug and lineinfo: - msg = ("debug and lineinfo are mutually exclusive. Use debug to get " - "full debug info (this disables some optimizations), or " - "lineinfo for line info only with code generation unaffected.") + msg = ( + "debug and lineinfo are mutually exclusive. 
Use debug to get " + "full debug info (this disables some optimizations), or " + "lineinfo for line info only with code generation unaffected." + ) warn(NumbaInvalidConfigWarning(msg)) - if device and kws.get('link'): + if device and kws.get("link"): raise ValueError("link keyword invalid for device function") if sigutils.is_signature(func_or_sig): @@ -101,19 +116,21 @@ def jit(func_or_sig=None, device=False, inline=False, link=[], debug=None, if signatures is not None: if config.ENABLE_CUDASIM: + def jitwrapper(func): return FakeCUDAKernel(func, device=device, fastmath=fastmath) + return jitwrapper def _jit(func): targetoptions = kws.copy() - targetoptions['debug'] = debug - targetoptions['lineinfo'] = lineinfo - targetoptions['link'] = link - targetoptions['opt'] = opt - targetoptions['fastmath'] = fastmath - targetoptions['device'] = device - targetoptions['extensions'] = extensions + targetoptions["debug"] = debug + targetoptions["lineinfo"] = lineinfo + targetoptions["link"] = link + targetoptions["opt"] = opt + targetoptions["fastmath"] = fastmath + targetoptions["device"] = device + targetoptions["extensions"] = extensions disp = CUDADispatcher(func, targetoptions=targetoptions) @@ -128,6 +145,7 @@ def _jit(func): if device: from numba.core import typeinfer + with typeinfer.register_dispatcher(disp): disp.compile_device(argtypes, restype) else: @@ -142,29 +160,41 @@ def _jit(func): else: if func_or_sig is None: if config.ENABLE_CUDASIM: + def autojitwrapper(func): - return FakeCUDAKernel(func, device=device, - fastmath=fastmath) + return FakeCUDAKernel( + func, device=device, fastmath=fastmath + ) else: + def autojitwrapper(func): - return jit(func, device=device, debug=debug, opt=opt, - lineinfo=lineinfo, link=link, cache=cache, **kws) + return jit( + func, + device=device, + debug=debug, + opt=opt, + lineinfo=lineinfo, + link=link, + cache=cache, + **kws, + ) return autojitwrapper # func_or_sig is a function else: if config.ENABLE_CUDASIM: - return 
FakeCUDAKernel(func_or_sig, device=device, - fastmath=fastmath) + return FakeCUDAKernel( + func_or_sig, device=device, fastmath=fastmath + ) else: targetoptions = kws.copy() - targetoptions['debug'] = debug - targetoptions['lineinfo'] = lineinfo - targetoptions['opt'] = opt - targetoptions['link'] = link - targetoptions['fastmath'] = fastmath - targetoptions['device'] = device - targetoptions['extensions'] = extensions + targetoptions["debug"] = debug + targetoptions["lineinfo"] = lineinfo + targetoptions["opt"] = opt + targetoptions["link"] = link + targetoptions["fastmath"] = fastmath + targetoptions["device"] = device + targetoptions["extensions"] = extensions disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions) if cache: @@ -191,7 +221,7 @@ def declare_device(name, sig, link=None): argtypes, restype = sigutils.normalize_signature(sig) if restype is None: - msg = 'Return type must be provided for device declarations' + msg = "Return type must be provided for device declarations" raise TypeError(msg) return declare_device_function(name, restype, argtypes, link) diff --git a/numba_cuda/numba/cuda/descriptor.py b/numba_cuda/numba/cuda/descriptor.py index b91ddf7a1..965f301be 100644 --- a/numba_cuda/numba/cuda/descriptor.py +++ b/numba_cuda/numba/cuda/descriptor.py @@ -30,4 +30,4 @@ def target_context(self): return self._targetctx -cuda_target = CUDATarget('cuda') +cuda_target = CUDATarget("cuda") diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py index e4352903b..da8074754 100644 --- a/numba_cuda/numba/cuda/device_init.py +++ b/numba_cuda/numba/cuda/device_init.py @@ -1,21 +1,58 @@ # Re export import sys from numba.cuda import cg -from .stubs import (threadIdx, blockIdx, blockDim, gridDim, laneid, warpsize, - syncwarp, shared, local, const, atomic, - shfl_sync_intrinsic, vote_sync_intrinsic, match_any_sync, - match_all_sync, threadfence_block, threadfence_system, - threadfence, selp, popc, brev, clz, ffs, fma, cbrt, 
- activemask, lanemask_lt, nanosleep, fp16, - _vector_type_stubs) -from .intrinsics import (grid, gridsize, syncthreads, syncthreads_and, - syncthreads_count, syncthreads_or) +from .stubs import ( + threadIdx, + blockIdx, + blockDim, + gridDim, + laneid, + warpsize, + syncwarp, + shared, + local, + const, + atomic, + shfl_sync_intrinsic, + vote_sync_intrinsic, + match_any_sync, + match_all_sync, + threadfence_block, + threadfence_system, + threadfence, + selp, + popc, + brev, + clz, + ffs, + fma, + cbrt, + activemask, + lanemask_lt, + nanosleep, + fp16, + _vector_type_stubs, +) +from .intrinsics import ( + grid, + gridsize, + syncthreads, + syncthreads_and, + syncthreads_count, + syncthreads_or, +) from .cudadrv.error import CudaSupportError -from numba.cuda.cudadrv.driver import (BaseCUDAMemoryManager, - HostOnlyCUDAMemoryManager, - GetIpcHandleMixin, MemoryPointer, - MappedMemory, PinnedMemory, MemoryInfo, - IpcHandle, set_memory_manager) +from numba.cuda.cudadrv.driver import ( + BaseCUDAMemoryManager, + HostOnlyCUDAMemoryManager, + GetIpcHandleMixin, + MemoryPointer, + MappedMemory, + PinnedMemory, + MemoryInfo, + IpcHandle, + set_memory_manager, +) from numba.cuda.cudadrv.runtime import runtime from .cudadrv import nvvm from numba.cuda import initialize @@ -26,13 +63,27 @@ from .api import _auto_device from .args import In, Out, InOut -from .intrinsic_wrapper import (all_sync, any_sync, eq_sync, ballot_sync, - shfl_sync, shfl_up_sync, shfl_down_sync, - shfl_xor_sync) +from .intrinsic_wrapper import ( + all_sync, + any_sync, + eq_sync, + ballot_sync, + shfl_sync, + shfl_up_sync, + shfl_down_sync, + shfl_xor_sync, +) from .kernels import reduction from numba.cuda.cudadrv.linkable_code import ( - Archive, CUSource, Cubin, Fatbin, LinkableCode, LTOIR, Object, PTXSource + Archive, + CUSource, + Cubin, + Fatbin, + LinkableCode, + LTOIR, + Object, + PTXSource, ) reduce = Reduce = reduction.Reduce diff --git a/numba_cuda/numba/cuda/deviceufunc.py 
b/numba_cuda/numba/cuda/deviceufunc.py index c29335a91..e6e69f4db 100644 --- a/numba_cuda/numba/cuda/deviceufunc.py +++ b/numba_cuda/numba/cuda/deviceufunc.py @@ -72,12 +72,12 @@ class UFuncMechanism(object): """ Prepare ufunc arguments for vectorize. """ + DEFAULT_STREAM = None SUPPORT_DEVICE_SLICING = False def __init__(self, typemap, args): - """Never used directly by user. Invoke by UFuncMechanism.call(). - """ + """Never used directly by user. Invoke by UFuncMechanism.call().""" self.typemap = typemap self.args = args nargs = len(self.args) @@ -105,7 +105,7 @@ def _fill_argtypes(self): """ for i, ary in enumerate(self.arrays): if ary is not None: - dtype = getattr(ary, 'dtype') + dtype = getattr(ary, "dtype") if dtype is None: dtype = np.asarray(ary).dtype self.argtypes[i] = dtype @@ -120,8 +120,9 @@ def _resolve_signature(self): # Try resolve scalar arguments for formaltys in self.typemap: match_map = [] - for i, (formal, actual) in enumerate(zip(formaltys, - self.argtypes)): + for i, (formal, actual) in enumerate( + zip(formaltys, self.argtypes) + ): if actual is None: actual = np.asarray(self.args[i]).dtype @@ -134,21 +135,26 @@ def _resolve_signature(self): if not matches: matches = [] for formaltys in self.typemap: - all_matches = all(actual is None or formal == actual - for formal, actual in - zip(formaltys, self.argtypes)) + all_matches = all( + actual is None or formal == actual + for formal, actual in zip(formaltys, self.argtypes) + ) if all_matches: matches.append(formaltys) if not matches: - raise TypeError("No matching version. GPU ufunc requires array " - "arguments to have the exact types. This behaves " - "like regular ufunc with casting='no'.") + raise TypeError( + "No matching version. GPU ufunc requires array " + "arguments to have the exact types. This behaves " + "like regular ufunc with casting='no'." + ) if len(matches) > 1: - raise TypeError("Failed to resolve ufunc due to ambiguous " - "signature. Too many untyped scalars. 
" - "Use numpy dtype object to type tag.") + raise TypeError( + "Failed to resolve ufunc due to ambiguous " + "signature. Too many untyped scalars. " + "Use numpy dtype object to type tag." + ) # Try scalar arguments self.argtypes = matches[0] @@ -163,8 +169,7 @@ def _get_actual_args(self): return self.arrays def _broadcast(self, arys): - """Perform numpy ufunc broadcasting - """ + """Perform numpy ufunc broadcasting""" shapelist = [a.shape for a in arys] shape = _multi_broadcast(*shapelist) @@ -177,9 +182,11 @@ def _broadcast(self, arys): arys[i] = self.broadcast_device(ary, shape) else: - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -187,9 +194,9 @@ def _broadcast(self, arys): for ax in ax_differs: strides[ax] = 0 - strided = np.lib.stride_tricks.as_strided(ary, - shape=shape, - strides=strides) + strided = np.lib.stride_tricks.as_strided( + ary, shape=shape, strides=strides + ) arys[i] = self.force_array_layout(strided) @@ -206,8 +213,7 @@ def get_arguments(self): return self._broadcast(arys) def get_function(self): - """Returns (result_dtype, function) - """ + """Returns (result_dtype, function)""" return self.typemap[self.argtypes] def is_device_array(self, obj): @@ -240,14 +246,13 @@ def force_array_layout(self, ary): @classmethod def call(cls, typemap, args, kws): - """Perform the entire ufunc call mechanism. 
- """ + """Perform the entire ufunc call mechanism.""" # Handle keywords - stream = kws.pop('stream', cls.DEFAULT_STREAM) - out = kws.pop('out', None) + stream = kws.pop("stream", cls.DEFAULT_STREAM) + out = kws.pop("out", None) if kws: - warnings.warn("unrecognized keywords: %s" % ', '.join(kws)) + warnings.warn("unrecognized keywords: %s" % ", ".join(kws)) # Begin call resolution cr = cls(typemap, args) @@ -364,9 +369,11 @@ def __init__(self, func, identity=None, cache=False, targetoptions={}): if cache: raise TypeError("caching is not supported") for opt in targetoptions: - if opt == 'nopython': - warnings.warn("nopython kwarg for cuda target is redundant", - RuntimeWarning) + if opt == "nopython": + warnings.warn( + "nopython kwarg for cuda target is redundant", + RuntimeWarning, + ) else: fmt = "Unrecognized options. " fmt += "cuda vectorize target does not support option: '%s'" @@ -386,14 +393,15 @@ def add(self, sig=None): devfnsig = signature(return_type, *args) funcname = self.pyfunc.__name__ - kernelsource = self._get_kernel_source(self._kernel_template, - devfnsig, funcname) + kernelsource = self._get_kernel_source( + self._kernel_template, devfnsig, funcname + ) corefn, return_type = self._compile_core(devfnsig) glbl = self._get_globals(corefn) sig = signature(types.void, *([a[:] for a in args] + [return_type[:]])) exec(kernelsource, glbl) - stager = glbl['__vectorized_%s' % funcname] + stager = glbl["__vectorized_%s" % funcname] kernel = self._compile_kernel(stager, sig) argdtypes = tuple(to_dtype(t) for t in devfnsig.args) @@ -404,10 +412,12 @@ def build_ufunc(self): raise NotImplementedError def _get_kernel_source(self, template, sig, funcname): - args = ['a%d' % i for i in range(len(sig.args))] - fmts = dict(name=funcname, - args=', '.join(args), - argitems=', '.join('%s[__tid__]' % i for i in args)) + args = ["a%d" % i for i in range(len(sig.args))] + fmts = dict( + name=funcname, + args=", ".join(args), + argitems=", ".join("%s[__tid__]" % i for i 
in args), + ) return template.format(**fmts) def _compile_core(self, sig): @@ -421,19 +431,26 @@ def _compile_kernel(self, fnobj, sig): class DeviceGUFuncVectorize(_BaseUFuncBuilder): - def __init__(self, func, sig, identity=None, cache=False, targetoptions={}, - writable_args=()): + def __init__( + self, + func, + sig, + identity=None, + cache=False, + targetoptions={}, + writable_args=(), + ): if cache: raise TypeError("caching is not supported") if writable_args: raise TypeError("writable_args are not supported") # Allow nopython flag to be set. - if not targetoptions.pop('nopython', True): + if not targetoptions.pop("nopython", True): raise TypeError("nopython flag must be True") # Are there any more target options? if targetoptions: - opts = ', '.join([repr(k) for k in targetoptions.keys()]) + opts = ", ".join([repr(k) for k in targetoptions.keys()]) fmt = "The following target options are not supported: {0}" raise TypeError(fmt.format(opts)) @@ -458,18 +475,21 @@ def add(self, sig=None): # specify the return type (where the "Python None" is the return type) valid_return_type = return_type in (types.none, None) if not valid_return_type: - raise TypeError('guvectorized functions cannot return values: ' - f'signature {sig} specifies {return_type} return ' - 'type') + raise TypeError( + "guvectorized functions cannot return values: " + f"signature {sig} specifies {return_type} return " + "type" + ) funcname = self.py_func.__name__ - src = expand_gufunc_template(self._kernel_template, indims, - outdims, funcname, args) + src = expand_gufunc_template( + self._kernel_template, indims, outdims, funcname, args + ) glbls = self._get_globals(sig) exec(src, glbls) - fnobj = glbls['__gufunc_{name}'.format(name=funcname)] + fnobj = glbls["__gufunc_{name}".format(name=funcname)] outertys = list(_determine_gufunc_outer_types(args, indims + outdims)) kernel = self._compile_kernel(fnobj, sig=tuple(outertys)) @@ -495,49 +515,58 @@ def _determine_gufunc_outer_types(argtys, 
dims): else: if nd > 0: raise ValueError("gufunc signature mismatch: ndim>0 for scalar") - yield types.Array(dtype=at, ndim=1, layout='A') + yield types.Array(dtype=at, ndim=1, layout="A") def expand_gufunc_template(template, indims, outdims, funcname, argtypes): - """Expand gufunc source template - """ + """Expand gufunc source template""" argdims = indims + outdims argnames = ["arg{0}".format(i) for i in range(len(argdims))] - checkedarg = "min({0})".format(', '.join(["{0}.shape[0]".format(a) - for a in argnames])) - inputs = [_gen_src_for_indexing(aref, adims, atype) - for aref, adims, atype in zip(argnames, indims, argtypes)] - outputs = [_gen_src_for_indexing(aref, adims, atype) - for aref, adims, atype in zip(argnames[len(indims):], outdims, - argtypes[len(indims):])] + checkedarg = "min({0})".format( + ", ".join(["{0}.shape[0]".format(a) for a in argnames]) + ) + inputs = [ + _gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip(argnames, indims, argtypes) + ] + outputs = [ + _gen_src_for_indexing(aref, adims, atype) + for aref, adims, atype in zip( + argnames[len(indims) :], outdims, argtypes[len(indims) :] + ) + ] argitems = inputs + outputs - src = template.format(name=funcname, args=', '.join(argnames), - checkedarg=checkedarg, - argitems=', '.join(argitems)) + src = template.format( + name=funcname, + args=", ".join(argnames), + checkedarg=checkedarg, + argitems=", ".join(argitems), + ) return src def _gen_src_for_indexing(aref, adims, atype): - return "{aref}[{sliced}]".format(aref=aref, - sliced=_gen_src_index(adims, atype)) + return "{aref}[{sliced}]".format( + aref=aref, sliced=_gen_src_index(adims, atype) + ) def _gen_src_index(adims, atype): if adims > 0: - return ','.join(['__tid__'] + [':'] * adims) + return ",".join(["__tid__"] + [":"] * adims) elif isinstance(atype, types.Array) and atype.ndim - 1 == adims: # Special case for 0-nd in shape-signature but # 1d array in type signature. 
# Slice it so that the result has the same dimension. - return '__tid__:(__tid__ + 1)' + return "__tid__:(__tid__ + 1)" else: - return '__tid__' + return "__tid__" class GUFuncEngine(object): - '''Determine how to broadcast and execute a gufunc + """Determine how to broadcast and execute a gufunc base on input shape and signature - ''' + """ @classmethod def from_signature(cls, signature): @@ -553,7 +582,7 @@ def __init__(self, inputsig, outputsig): def schedule(self, ishapes): if len(ishapes) != self.nin: - raise TypeError('invalid number of input argument') + raise TypeError("invalid number of input argument") # associate symbol values for input signature symbolmap = {} @@ -626,7 +655,7 @@ def __init__(self, parent, ishapes, oshapes, loopdims, pinned): def __str__(self): import pprint - attrs = 'ishapes', 'oshapes', 'loopdims', 'loopn', 'pinned' + attrs = "ishapes", "oshapes", "loopdims", "loopn", "pinned" values = [(k, getattr(self, k)) for k in attrs] return pprint.pformat(dict(values)) @@ -635,13 +664,15 @@ class GeneralizedUFunc(object): def __init__(self, kernelmap, engine): self.kernelmap = kernelmap self.engine = engine - self.max_blocksize = 2 ** 30 + self.max_blocksize = 2**30 def __call__(self, *args, **kws): - callsteps = self._call_steps(self.engine.nin, self.engine.nout, - args, kws) + callsteps = self._call_steps( + self.engine.nin, self.engine.nout, args, kws + ) indtypes, schedule, outdtypes, kernel = self._schedule( - callsteps.inputs, callsteps.outputs) + callsteps.inputs, callsteps.outputs + ) callsteps.adjust_input_types(indtypes) outputs = callsteps.prepare_outputs(schedule, outdtypes) @@ -671,7 +702,7 @@ def _schedule(self, inputs, outs): # check output for sched_shape, out in zip(schedule.output_shapes, outs): if out is not None and sched_shape != out.shape: - raise ValueError('output shape mismatch') + raise ValueError("output shape mismatch") return indtypes, schedule, outdtypes, kernel @@ -683,8 +714,10 @@ def 
_search_matching_signature(self, idtypes): Note: Ordering is guaranteed by `kernelmap` being a OrderedDict """ for sig in self.kernelmap.keys(): - if all(np.can_cast(actual, desired) - for actual, desired in zip(sig, idtypes)): + if all( + np.can_cast(actual, desired) + for actual, desired in zip(sig, idtypes) + ): return sig else: raise TypeError("no matching signature") @@ -716,8 +749,9 @@ def _broadcast_array(self, ary, newdim, innerdim): # Creating new dimension elif len(ary.shape) < len(newshape): - assert newshape[-len(ary.shape):] == ary.shape, \ + assert newshape[-len(ary.shape) :] == ary.shape, ( "cannot add dim and reshape at the same time" + ) return self._broadcast_add_axis(ary, newshape) # Collapsing dimension @@ -744,9 +778,9 @@ class GUFuncCallSteps(metaclass=ABCMeta): # The base class uses these slots; subclasses may provide additional slots. __slots__ = [ - 'outputs', - 'inputs', - '_copy_result_to_host', + "outputs", + "inputs", + "_copy_result_to_host", ] @abstractmethod @@ -782,21 +816,25 @@ def allocate_device_array(self, shape, dtype): """ def __init__(self, nin, nout, args, kwargs): - outputs = kwargs.get('out') + outputs = kwargs.get("out") # Ensure the user has passed a correct number of arguments if outputs is None and len(args) not in (nin, (nin + nout)): + def pos_argn(n): - return f'{n} positional argument{"s" * (n != 1)}' + return f"{n} positional argument{'s' * (n != 1)}" - msg = (f'This gufunc accepts {pos_argn(nin)} (when providing ' - f'input only) or {pos_argn(nin + nout)} (when providing ' - f'input and output). Got {pos_argn(len(args))}.') + msg = ( + f"This gufunc accepts {pos_argn(nin)} (when providing " + f"input only) or {pos_argn(nin + nout)} (when providing " + f"input and output). Got {pos_argn(len(args))}." 
+ ) raise TypeError(msg) if outputs is not None and len(args) > nin: - raise ValueError("cannot specify argument 'out' as both positional " - "and keyword") + raise ValueError( + "cannot specify argument 'out' as both positional and keyword" + ) else: # If the user did not pass outputs either in the out kwarg or as # positional arguments, then we need to generate an initial list of @@ -819,8 +857,9 @@ def pos_argn(n): # - If any of the arguments are device arrays, we leave the output on # the device. - self._copy_result_to_host = (all_host_arrays and - all_user_outputs_are_host) + self._copy_result_to_host = ( + all_host_arrays and all_user_outputs_are_host + ) # Normalize arguments - ensure they are either device- or host-side # arrays (as opposed to lists, tuples, etc). @@ -850,9 +889,11 @@ def adjust_input_types(self, indtypes): """ for i, (ity, val) in enumerate(zip(indtypes, self.inputs)): if ity != val.dtype: - if not hasattr(val, 'astype'): - msg = ("compatible signature is possible by casting but " - "{0} does not support .astype()").format(type(val)) + if not hasattr(val, "astype"): + msg = ( + "compatible signature is possible by casting but " + "{0} does not support .astype()" + ).format(type(val)) raise TypeError(msg) # Cast types self.inputs[i] = val.astype(ity) @@ -866,8 +907,9 @@ def prepare_outputs(self, schedule, outdtypes): device; other outputs are allocated as necessary. """ outputs = [] - for shape, dtype, output in zip(schedule.output_shapes, outdtypes, - self.outputs): + for shape, dtype, output in zip( + schedule.output_shapes, outdtypes, self.outputs + ): if output is None or self._copy_result_to_host: output = self.allocate_device_array(shape, dtype) outputs.append(output) @@ -878,6 +920,7 @@ def prepare_inputs(self): """ Returns a list of input parameters that all reside on the target device. 
""" + def ensure_device(parameter): if self.is_device_array(parameter): convert = self.as_device_array @@ -897,8 +940,10 @@ def post_process_outputs(self, outputs): jarring, it is consistent with the behavior of GUFuncs in general. """ if self._copy_result_to_host: - outputs = [self.to_host(output, self_output) - for output, self_output in zip(outputs, self.outputs)] + outputs = [ + self.to_host(output, self_output) + for output, self_output in zip(outputs, self.outputs) + ] elif self.outputs[0] is not None: outputs = self.outputs diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py index ba90f53a0..9f258db33 100644 --- a/numba_cuda/numba/cuda/dispatcher.py +++ b/numba_cuda/numba/cuda/dispatcher.py @@ -15,13 +15,19 @@ from numba.core.types.functions import Function from numba.cuda.api import get_current_device from numba.cuda.args import wrap_arg -from numba.cuda.compiler import (compile_cuda, CUDACompiler, kernel_fixup, - ExternFunction) +from numba.cuda.compiler import ( + compile_cuda, + CUDACompiler, + kernel_fixup, + ExternFunction, +) from numba.cuda.cudadrv import driver from numba.cuda.cudadrv.devices import get_context from numba.cuda.descriptor import cuda_target -from numba.cuda.errors import (missing_launch_config_msg, - normalize_kernel_dimensions) +from numba.cuda.errors import ( + missing_launch_config_msg, + normalize_kernel_dimensions, +) from numba.cuda import types as cuda_types from numba.cuda.runtime.nrt import rtsys @@ -30,17 +36,26 @@ from warnings import warn -cuda_fp16_math_funcs = ['hsin', 'hcos', - 'hlog', 'hlog10', - 'hlog2', - 'hexp', 'hexp10', - 'hexp2', - 'hsqrt', 'hrsqrt', - 'hfloor', 'hceil', - 'hrcp', 'hrint', - 'htrunc', 'hdiv'] - -reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape'] +cuda_fp16_math_funcs = [ + "hsin", + "hcos", + "hlog", + "hlog10", + "hlog2", + "hexp", + "hexp10", + "hexp2", + "hsqrt", + "hrsqrt", + "hfloor", + "hceil", + "hrcp", + "hrint", + "htrunc", + 
"hdiv", +] + +reshape_funcs = ["nocopy_empty_reshape", "numba_attempt_nocopy_reshape"] def get_cres_link_objects(cres): @@ -51,17 +66,16 @@ def get_cres_link_objects(cres): # List of calls into declared device functions device_func_calls = [ - (name, v) for name, v in cres.fndesc.typemap.items() if ( - isinstance(v, cuda_types.CUDADispatcher) - ) + (name, v) + for name, v in cres.fndesc.typemap.items() + if (isinstance(v, cuda_types.CUDADispatcher)) ] # List of tuples with SSA name of calls and corresponding signature call_signatures = [ (call.func.name, sig) - for call, sig in cres.fndesc.calltypes.items() if ( - isinstance(call, ir.Expr) and call.op == 'call' - ) + for call, sig in cres.fndesc.calltypes.items() + if (isinstance(call, ir.Expr) and call.op == "call") ] # Map SSA names to all invoked signatures @@ -93,10 +107,10 @@ def get_cres_link_objects(cres): class _Kernel(serialize.ReduceMixin): - ''' + """ CUDA Kernel specialized for a given set of argument types. When called, this object launches the kernel on the device. 
- ''' + """ NRT_functions = [ "NRT_Allocate", @@ -110,16 +124,27 @@ class _Kernel(serialize.ReduceMixin): "NRT_MemInfo_alloc_aligned", "NRT_Allocate_External", "NRT_decref", - "NRT_incref" + "NRT_incref", ] @global_compiler_lock - def __init__(self, py_func, argtypes, link=None, debug=False, - lineinfo=False, inline=False, fastmath=False, extensions=None, - max_registers=None, lto=False, opt=True, device=False): - + def __init__( + self, + py_func, + argtypes, + link=None, + debug=False, + lineinfo=False, + inline=False, + fastmath=False, + extensions=None, + max_registers=None, + lto=False, + opt=True, + device=False, + ): if device: - raise RuntimeError('Cannot compile a device function as a kernel') + raise RuntimeError("Cannot compile a device function as a kernel") super().__init__() @@ -144,24 +169,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False, self.lineinfo = lineinfo self.extensions = extensions or [] - nvvm_options = { - 'fastmath': fastmath, - 'opt': 3 if opt else 0 - } + nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0} if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None cc = get_current_device().compute_capability - cres = compile_cuda(self.py_func, types.void, self.argtypes, - debug=self.debug, - lineinfo=lineinfo, - inline=inline, - fastmath=fastmath, - nvvm_options=nvvm_options, - cc=cc, - max_registers=max_registers, - lto=lto) + cres = compile_cuda( + self.py_func, + types.void, + self.argtypes, + debug=self.debug, + lineinfo=lineinfo, + inline=inline, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + max_registers=max_registers, + lto=lto, + ) tgt_ctx = cres.target_context lib = cres.library kernel = lib.get_function(cres.fndesc.llvm_func_name) @@ -174,24 +200,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False, asm = lib.get_asm_str() # A kernel needs cooperative launch if grid_sync is being used. 
- self.cooperative = 'cudaCGGetIntrinsicHandle' in asm + self.cooperative = "cudaCGGetIntrinsicHandle" in asm # We need to link against cudadevrt if grid sync is being used. if self.cooperative: lib.needs_cudadevrt = True - def link_to_library_functions(library_functions, library_path, - prefix=None): + def link_to_library_functions( + library_functions, library_path, prefix=None + ): """ Dynamically links to library functions by searching for their names in the specified library and linking to the corresponding source file. """ if prefix is not None: - library_functions = [f"{prefix}{fn}" for fn in - library_functions] + library_functions = [ + f"{prefix}{fn}" for fn in library_functions + ] - found_functions = [fn for fn in library_functions - if f'{fn}' in asm] + found_functions = [fn for fn in library_functions if f"{fn}" in asm] if found_functions: basedir = os.path.dirname(os.path.abspath(__file__)) @@ -201,11 +228,11 @@ def link_to_library_functions(library_functions, library_path, return found_functions # Link to the helper library functions if needed - link_to_library_functions(reshape_funcs, 'reshape_funcs.cu') + link_to_library_functions(reshape_funcs, "reshape_funcs.cu") # Link to the CUDA FP16 math library functions if needed - link_to_library_functions(cuda_fp16_math_funcs, - 'cpp_function_wrappers.cu', - '__numba_wrapper_') + link_to_library_functions( + cuda_fp16_math_funcs, "cpp_function_wrappers.cu", "__numba_wrapper_" + ) self.maybe_link_nrt(link, tgt_ctx, asm) @@ -239,15 +266,16 @@ def maybe_link_nrt(self, link, tgt_ctx, asm): all_nrt = "|".join(self.NRT_functions) pattern = ( - r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?(' - + all_nrt + r')\s*\([^)]*\)\s*;' + r"\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?(" + + all_nrt + + r")\s*\([^)]*\)\s*;" ) nrt_in_asm = re.findall(pattern, asm) basedir = os.path.dirname(os.path.abspath(__file__)) if nrt_in_asm: - nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu') + nrt_path = os.path.join(basedir, "runtime", 
"nrt.cu") link.append(nrt_path) @property @@ -270,8 +298,17 @@ def argument_types(self): return tuple(self.signature.args) @classmethod - def _rebuild(cls, cooperative, name, signature, codelibrary, - debug, lineinfo, call_helper, extensions): + def _rebuild( + cls, + cooperative, + name, + signature, + codelibrary, + debug, + lineinfo, + call_helper, + extensions, + ): """ Rebuild an instance. """ @@ -299,10 +336,16 @@ def _reduce_states(self): Thread, block and shared memory configuration are serialized. Stream information is discarded. """ - return dict(cooperative=self.cooperative, name=self.entry_name, - signature=self.signature, codelibrary=self._codelibrary, - debug=self.debug, lineinfo=self.lineinfo, - call_helper=self.call_helper, extensions=self.extensions) + return dict( + cooperative=self.cooperative, + name=self.entry_name, + signature=self.signature, + codelibrary=self._codelibrary, + debug=self.debug, + lineinfo=self.lineinfo, + call_helper=self.call_helper, + extensions=self.extensions, + ) def bind(self): """ @@ -323,73 +366,73 @@ def bind(self): @property def regs_per_thread(self): - ''' + """ The number of registers used by each thread for this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.regs @property def const_mem_size(self): - ''' + """ The amount of constant memory used by this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.const @property def shared_mem_per_block(self): - ''' + """ The amount of shared memory used per block for this kernel. - ''' + """ return self._codelibrary.get_cufunc().attrs.shared @property def max_threads_per_block(self): - ''' + """ The maximum allowable threads per block. - ''' + """ return self._codelibrary.get_cufunc().attrs.maxthreads @property def local_mem_per_thread(self): - ''' + """ The amount of local memory used per thread for this kernel. 
- ''' + """ return self._codelibrary.get_cufunc().attrs.local def inspect_llvm(self): - ''' + """ Returns the LLVM IR for this kernel. - ''' + """ return self._codelibrary.get_llvm_str() def inspect_asm(self, cc): - ''' + """ Returns the PTX code for this kernel. - ''' + """ return self._codelibrary.get_asm_str(cc=cc) def inspect_sass_cfg(self): - ''' + """ Returns the CFG of the SASS for this kernel. Requires nvdisasm to be available on the PATH. - ''' + """ return self._codelibrary.get_sass_cfg() def inspect_sass(self): - ''' + """ Returns the SASS code for this kernel. Requires nvdisasm to be available on the PATH. - ''' + """ return self._codelibrary.get_sass() def inspect_types(self, file=None): - ''' + """ Produce a dump of the Python source of this function annotated with the corresponding Numba IR and type information. The dump is written to *file*, or *sys.stdout* if *file* is *None*. - ''' + """ if self._type_annotation is None: raise ValueError("Type annotation is not available") @@ -397,12 +440,12 @@ def inspect_types(self, file=None): file = sys.stdout print("%s %s" % (self.entry_name, self.argument_types), file=file) - print('-' * 80, file=file) + print("-" * 80, file=file) print(self._type_annotation, file=file) - print('=' * 80, file=file) + print("=" * 80, file=file) def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0): - ''' + """ Calculates the maximum number of blocks that can be launched for this kernel in a cooperative grid in the current context, for the given block and dynamic shared memory sizes. @@ -411,15 +454,15 @@ def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0): a tuple for 2D or 3D blocks. :param dynsmemsize: Dynamic shared memory size in bytes. :return: The maximum number of blocks in the grid. 
- ''' + """ ctx = get_context() cufunc = self._codelibrary.get_cufunc() if isinstance(blockdim, tuple): blockdim = functools.reduce(lambda x, y: x * y, blockdim) - active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc, - blockdim, - dynsmemsize) + active_per_sm = ctx.get_active_blocks_per_multiprocessor( + cufunc, blockdim, dynsmemsize + ) sm_count = ctx.device.MULTIPROCESSOR_COUNT return active_per_sm * sm_count @@ -435,7 +478,7 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): excmem.memset(0, stream=stream) # Prepare arguments - retr = [] # hold functors for writeback + retr = [] # hold functors for writeback kernelargs = [] for t, v in zip(self.argument_types, args): @@ -449,46 +492,51 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0): stream_handle = stream and stream.handle or zero_stream # Invoke kernel - driver.launch_kernel(cufunc.handle, - *griddim, - *blockdim, - sharedmem, - stream_handle, - kernelargs, - cooperative=self.cooperative) + driver.launch_kernel( + cufunc.handle, + *griddim, + *blockdim, + sharedmem, + stream_handle, + kernelargs, + cooperative=self.cooperative, + ) if self.debug: driver.device_to_host(ctypes.addressof(excval), excmem, excsz) if excval.value != 0: # An error occurred def load_symbol(name): - mem, sz = cufunc.module.get_global_symbol("%s__%s__" % - (cufunc.name, - name)) + mem, sz = cufunc.module.get_global_symbol( + "%s__%s__" % (cufunc.name, name) + ) val = ctypes.c_int() driver.device_to_host(ctypes.addressof(val), mem, sz) return val.value - tid = [load_symbol("tid" + i) for i in 'zyx'] - ctaid = [load_symbol("ctaid" + i) for i in 'zyx'] + tid = [load_symbol("tid" + i) for i in "zyx"] + ctaid = [load_symbol("ctaid" + i) for i in "zyx"] code = excval.value exccls, exc_args, loc = self.call_helper.get_exception(code) # Prefix the exception message with the source location if loc is None: - locinfo = '' + locinfo = "" else: sym, filepath, lineno = loc filepath = 
os.path.abspath(filepath) - locinfo = 'In function %r, file %s, line %s, ' % (sym, - filepath, - lineno,) + locinfo = "In function %r, file %s, line %s, " % ( + sym, + filepath, + lineno, + ) # Prefix the exception message with the thread position prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid) if exc_args: - exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \ - exc_args[1:] + exc_args = ("%s: %s" % (prefix, exc_args[0]),) + exc_args[ + 1: + ] else: - exc_args = prefix, + exc_args = (prefix,) raise exccls(*exc_args) # retrieve auto converted arrays @@ -502,11 +550,7 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): # map the arguments using any extension you've registered for extension in reversed(self.extensions): - ty, val = extension.prepare_args( - ty, - val, - stream=stream, - retr=retr) + ty, val = extension.prepare_args(ty, val, stream=stream, retr=retr) if isinstance(ty, types.Array): devary = wrap_arg(val).to_device(retr, stream) @@ -592,8 +636,9 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): class ForAll(object): def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem): if ntasks < 0: - raise ValueError("Can't create ForAll with negative task count: %s" - % ntasks) + raise ValueError( + "Can't create ForAll with negative task count: %s" % ntasks + ) self.dispatcher = dispatcher self.ntasks = ntasks self.thread_per_block = tpb @@ -611,8 +656,9 @@ def __call__(self, *args): blockdim = self._compute_thread_per_block(specialized) griddim = (self.ntasks + blockdim - 1) // blockdim - return specialized[griddim, blockdim, self.stream, - self.sharedmem](*args) + return specialized[griddim, blockdim, self.stream, self.sharedmem]( + *args + ) def _compute_thread_per_block(self, dispatcher): tpb = self.thread_per_block @@ -627,7 +673,7 @@ def _compute_thread_per_block(self, dispatcher): kernel = next(iter(dispatcher.overloads.values())) kwargs = dict( func=kernel._codelibrary.get_cufunc(), - b2d_func=0, # dynamic-shared memory 
is constant to blksz + b2d_func=0, # dynamic-shared memory is constant to blksz memsize=self.sharedmem, blocksizelimit=1024, ) @@ -658,13 +704,16 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem): min_grid_size = 128 grid_size = griddim[0] * griddim[1] * griddim[2] if grid_size < min_grid_size: - msg = (f"Grid size {grid_size} will likely result in GPU " - "under-utilization due to low occupancy.") + msg = ( + f"Grid size {grid_size} will likely result in GPU " + "under-utilization due to low occupancy." + ) warn(NumbaPerformanceWarning(msg)) def __call__(self, *args): - return self.dispatcher.call(args, self.griddim, self.blockdim, - self.stream, self.sharedmem) + return self.dispatcher.call( + args, self.griddim, self.blockdim, self.stream, self.sharedmem + ) class CUDACacheImpl(CacheImpl): @@ -689,6 +738,7 @@ class CUDACache(Cache): """ Implements a cache that saves and loads CUDA kernels and compile results. """ + _impl_class = CUDACacheImpl def load_overload(self, sig, target_context): @@ -696,12 +746,13 @@ def load_overload(self, sig, target_context): # initialized. To initialize the correct (i.e. CUDA) target, we need to # enforce that the current target is the CUDA target. from numba.core.target_extension import target_override - with target_override('cuda'): + + with target_override("cuda"): return super().load_overload(sig, target_context) class CUDADispatcher(Dispatcher, serialize.ReduceMixin): - ''' + """ CUDA Dispatcher object. When configured and called, the dispatcher will specialize itself for the given arguments (if no suitable specialized version already exists) & compute capability, and launch on the device @@ -709,7 +760,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): Dispatcher objects are not to be constructed by the user, but instead are created using the :func:`numba.cuda.jit` decorator. - ''' + """ # Whether to fold named arguments and default values. 
Default values are # presently unsupported on CUDA, so we can leave this as False in all @@ -719,8 +770,9 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin): targetdescr = cuda_target def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler): - super().__init__(py_func, targetoptions=targetoptions, - pipeline_class=pipeline_class) + super().__init__( + py_func, targetoptions=targetoptions, pipeline_class=pipeline_class + ) # The following properties are for specialization of CUDADispatchers. A # specialized CUDADispatcher is one that is compiled for exactly one @@ -748,7 +800,7 @@ def configure(self, griddim, blockdim, stream=0, sharedmem=0): def __getitem__(self, args): if len(args) not in [2, 3, 4]: - raise ValueError('must specify at least the griddim and blockdim') + raise ValueError("must specify at least the griddim and blockdim") return self.configure(*args) def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): @@ -775,7 +827,7 @@ def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): @property def extensions(self): - ''' + """ A list of objects that must have a `prepare_args` function. When a specialized kernel is called, each argument will be passed through to the `prepare_args` (from the last object in this list to the @@ -791,17 +843,17 @@ def extensions(self): will be passed in turn to the next right-most `extension`. After all the extensions have been called, the resulting `(ty, val)` will be passed into Numba's default argument marshalling logic. - ''' - return self.targetoptions.get('extensions') + """ + return self.targetoptions.get("extensions") def __call__(self, *args, **kwargs): # An attempt to launch an unconfigured kernel raise ValueError(missing_launch_config_msg) def call(self, args, griddim, blockdim, stream, sharedmem): - ''' + """ Compile if necessary and invoke this kernel with *args*. 
- ''' + """ if self.specialized: kernel = next(iter(self.overloads.values())) else: @@ -824,28 +876,30 @@ def typeof_pyval(self, val): if cuda.is_cuda_array(val): # When typing, we don't need to synchronize on the array's # stream - this is done when the kernel is launched. - return typeof(cuda.as_cuda_array(val, sync=False), - Purpose.argument) + return typeof( + cuda.as_cuda_array(val, sync=False), Purpose.argument + ) else: raise def specialize(self, *args): - ''' + """ Create a new instance of this dispatcher specialized for the given *args*. - ''' + """ cc = get_current_device().compute_capability argtypes = tuple(self.typeof_pyval(a) for a in args) if self.specialized: - raise RuntimeError('Dispatcher already specialized') + raise RuntimeError("Dispatcher already specialized") specialization = self.specializations.get((cc, argtypes)) if specialization: return specialization targetoptions = self.targetoptions - specialization = CUDADispatcher(self.py_func, - targetoptions=targetoptions) + specialization = CUDADispatcher( + self.py_func, targetoptions=targetoptions + ) specialization.compile(argtypes) specialization.disable_compile() specialization._specialized = True @@ -860,7 +914,7 @@ def specialized(self): return self._specialized def get_regs_per_thread(self, signature=None): - ''' + """ Returns the number of registers used by each thread in this kernel for the device in the current context. @@ -869,17 +923,19 @@ def get_regs_per_thread(self, signature=None): kernel. :return: The number of registers used by the compiled variant of the kernel for the given signature and current device. 
- ''' + """ if signature is not None: return self.overloads[signature.args].regs_per_thread if self.specialized: return next(iter(self.overloads.values())).regs_per_thread else: - return {sig: overload.regs_per_thread - for sig, overload in self.overloads.items()} + return { + sig: overload.regs_per_thread + for sig, overload in self.overloads.items() + } def get_const_mem_size(self, signature=None): - ''' + """ Returns the size in bytes of constant memory used by this kernel for the device in the current context. @@ -889,17 +945,19 @@ def get_const_mem_size(self, signature=None): :return: The size in bytes of constant memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].const_mem_size if self.specialized: return next(iter(self.overloads.values())).const_mem_size else: - return {sig: overload.const_mem_size - for sig, overload in self.overloads.items()} + return { + sig: overload.const_mem_size + for sig, overload in self.overloads.items() + } def get_shared_mem_per_block(self, signature=None): - ''' + """ Returns the size in bytes of statically allocated shared memory for this kernel. @@ -908,17 +966,19 @@ def get_shared_mem_per_block(self, signature=None): specialized kernel. :return: The amount of shared memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].shared_mem_per_block if self.specialized: return next(iter(self.overloads.values())).shared_mem_per_block else: - return {sig: overload.shared_mem_per_block - for sig, overload in self.overloads.items()} + return { + sig: overload.shared_mem_per_block + for sig, overload in self.overloads.items() + } def get_max_threads_per_block(self, signature=None): - ''' + """ Returns the maximum allowable number of threads per block for this kernel. 
Exceeding this threshold will result in the kernel failing to launch. @@ -929,17 +989,19 @@ def get_max_threads_per_block(self, signature=None): :return: The maximum allowable threads per block for the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].max_threads_per_block if self.specialized: return next(iter(self.overloads.values())).max_threads_per_block else: - return {sig: overload.max_threads_per_block - for sig, overload in self.overloads.items()} + return { + sig: overload.max_threads_per_block + for sig, overload in self.overloads.items() + } def get_local_mem_per_thread(self, signature=None): - ''' + """ Returns the size in bytes of local memory per thread for this kernel. @@ -948,14 +1010,16 @@ def get_local_mem_per_thread(self, signature=None): specialized kernel. :return: The amount of local memory allocated by the compiled variant of the kernel for the given signature and current device. - ''' + """ if signature is not None: return self.overloads[signature.args].local_mem_per_thread if self.specialized: return next(iter(self.overloads.values())).local_mem_per_thread else: - return {sig: overload.local_mem_per_thread - for sig, overload in self.overloads.items()} + return { + sig: overload.local_mem_per_thread + for sig, overload in self.overloads.items() + } def get_call_template(self, args, kws): # Originally copied from _DispatcherBase.get_call_template. 
This @@ -983,7 +1047,8 @@ def get_call_template(self, args, kws): name = "CallTemplate({0})".format(func_name) call_template = typing.make_concrete_template( - name, key=func_name, signatures=self.nopython_signatures) + name, key=func_name, signatures=self.nopython_signatures + ) pysig = utils.pysignature(self.py_func) return call_template, pysig, args, kws @@ -998,33 +1063,36 @@ def compile_device(self, args, return_type=None): """ if args not in self.overloads: with self._compiling_counter: - - debug = self.targetoptions.get('debug') - lineinfo = self.targetoptions.get('lineinfo') - inline = self.targetoptions.get('inline') - fastmath = self.targetoptions.get('fastmath') + debug = self.targetoptions.get("debug") + lineinfo = self.targetoptions.get("lineinfo") + inline = self.targetoptions.get("inline") + fastmath = self.targetoptions.get("fastmath") nvvm_options = { - 'opt': 3 if self.targetoptions.get('opt') else 0, - 'fastmath': fastmath + "opt": 3 if self.targetoptions.get("opt") else 0, + "fastmath": fastmath, } if debug: - nvvm_options['g'] = None + nvvm_options["g"] = None cc = get_current_device().compute_capability - cres = compile_cuda(self.py_func, return_type, args, - debug=debug, - lineinfo=lineinfo, - inline=inline, - fastmath=fastmath, - nvvm_options=nvvm_options, - cc=cc) + cres = compile_cuda( + self.py_func, + return_type, + args, + debug=debug, + lineinfo=lineinfo, + inline=inline, + fastmath=fastmath, + nvvm_options=nvvm_options, + cc=cc, + ) self.overloads[args] = cres - cres.target_context.insert_user_function(cres.entry_point, - cres.fndesc, - [cres.library]) + cres.target_context.insert_user_function( + cres.entry_point, cres.fndesc, [cres.library] + ) else: cres = self.overloads[args] @@ -1036,10 +1104,10 @@ def add_overload(self, kernel, argtypes): self.overloads[argtypes] = kernel def compile(self, sig): - ''' + """ Compile and bind to the current context a version of this kernel specialized for the given signature. 
- ''' + """ argtypes, return_type = sigutils.normalize_signature(sig) assert return_type is None or return_type == types.none @@ -1072,15 +1140,15 @@ def compile(self, sig): return kernel def inspect_llvm(self, signature=None): - ''' + """ Return the LLVM IR for this kernel. :param signature: A tuple of argument types. :return: The LLVM IR for the given signature, or a dict of LLVM IR for all previously-encountered signatures. - ''' - device = self.targetoptions.get('device') + """ + device = self.targetoptions.get("device") if signature is not None: if device: return self.overloads[signature].library.get_llvm_str() @@ -1088,23 +1156,27 @@ def inspect_llvm(self, signature=None): return self.overloads[signature].inspect_llvm() else: if device: - return {sig: overload.library.get_llvm_str() - for sig, overload in self.overloads.items()} + return { + sig: overload.library.get_llvm_str() + for sig, overload in self.overloads.items() + } else: - return {sig: overload.inspect_llvm() - for sig, overload in self.overloads.items()} + return { + sig: overload.inspect_llvm() + for sig, overload in self.overloads.items() + } def inspect_asm(self, signature=None): - ''' + """ Return this kernel's PTX assembly code for for the device in the current context. :param signature: A tuple of argument types. :return: The PTX code for the given signature, or a dict of PTX codes for all previously-encountered signatures. 
- ''' + """ cc = get_current_device().compute_capability - device = self.targetoptions.get('device') + device = self.targetoptions.get("device") if signature is not None: if device: return self.overloads[signature].library.get_asm_str(cc) @@ -1112,14 +1184,18 @@ def inspect_asm(self, signature=None): return self.overloads[signature].inspect_asm(cc) else: if device: - return {sig: overload.library.get_asm_str(cc) - for sig, overload in self.overloads.items()} + return { + sig: overload.library.get_asm_str(cc) + for sig, overload in self.overloads.items() + } else: - return {sig: overload.inspect_asm(cc) - for sig, overload in self.overloads.items()} + return { + sig: overload.inspect_asm(cc) + for sig, overload in self.overloads.items() + } def inspect_sass_cfg(self, signature=None): - ''' + """ Return this kernel's CFG for the device in the current context. :param signature: A tuple of argument types. @@ -1129,18 +1205,20 @@ def inspect_sass_cfg(self, signature=None): The CFG for the device in the current context is returned. Requires nvdisasm to be available on the PATH. - ''' - if self.targetoptions.get('device'): - raise RuntimeError('Cannot get the CFG of a device function') + """ + if self.targetoptions.get("device"): + raise RuntimeError("Cannot get the CFG of a device function") if signature is not None: return self.overloads[signature].inspect_sass_cfg() else: - return {sig: defn.inspect_sass_cfg() - for sig, defn in self.overloads.items()} + return { + sig: defn.inspect_sass_cfg() + for sig, defn in self.overloads.items() + } def inspect_sass(self, signature=None): - ''' + """ Return this kernel's SASS assembly code for for the device in the current context. @@ -1151,22 +1229,23 @@ def inspect_sass(self, signature=None): SASS for the device in the current context is returned. Requires nvdisasm to be available on the PATH. 
- ''' - if self.targetoptions.get('device'): - raise RuntimeError('Cannot inspect SASS of a device function') + """ + if self.targetoptions.get("device"): + raise RuntimeError("Cannot inspect SASS of a device function") if signature is not None: return self.overloads[signature].inspect_sass() else: - return {sig: defn.inspect_sass() - for sig, defn in self.overloads.items()} + return { + sig: defn.inspect_sass() for sig, defn in self.overloads.items() + } def inspect_types(self, file=None): - ''' + """ Produce a dump of the Python source of this function annotated with the corresponding Numba IR and type information. The dump is written to *file*, or *sys.stdout* if *file* is *None*. - ''' + """ if file is None: file = sys.stdout @@ -1186,5 +1265,4 @@ def _reduce_states(self): Reduce the instance for serialization. Compiled definitions are discarded. """ - return dict(py_func=self.py_func, - targetoptions=self.targetoptions) + return dict(py_func=self.py_func, targetoptions=self.targetoptions) diff --git a/numba_cuda/numba/cuda/errors.py b/numba_cuda/numba/cuda/errors.py index 653a0db6e..16989714e 100644 --- a/numba_cuda/numba/cuda/errors.py +++ b/numba_cuda/numba/cuda/errors.py @@ -7,8 +7,7 @@ def __init__(self, msg, tid=None, ctaid=None): self.tid = tid self.ctaid = ctaid self.msg = msg - t = ("An exception was raised in thread=%s block=%s\n" - "\t%s") + t = "An exception was raised in thread=%s block=%s\n\t%s" msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) @@ -17,8 +16,9 @@ class CudaLoweringError(LoweringError): pass -_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/" - "kernels.html#kernel-invocation") +_launch_help_url = ( + "https://numba.readthedocs.io/en/stable/cuda/kernels.html#kernel-invocation" +) missing_launch_config_msg = """ Kernel launch configuration was not specified. 
Use the syntax: @@ -40,12 +40,15 @@ def check_dim(dim, name): else: dim = list(dim) if len(dim) > 3: - raise ValueError('%s must be a sequence of 1, 2 or 3 integers, ' - 'got %r' % (name, dim)) + raise ValueError( + "%s must be a sequence of 1, 2 or 3 integers, " + "got %r" % (name, dim) + ) for v in dim: if not isinstance(v, numbers.Integral): - raise TypeError('%s must be a sequence of integers, got %r' - % (name, dim)) + raise TypeError( + "%s must be a sequence of integers, got %r" % (name, dim) + ) while len(dim) < 3: dim.append(1) return tuple(dim) @@ -53,7 +56,7 @@ def check_dim(dim, name): if None in (griddim, blockdim): raise ValueError(missing_launch_config_msg) - griddim = check_dim(griddim, 'griddim') - blockdim = check_dim(blockdim, 'blockdim') + griddim = check_dim(griddim, "griddim") + blockdim = check_dim(blockdim, "blockdim") return griddim, blockdim diff --git a/numba_cuda/numba/cuda/extending.py b/numba_cuda/numba/cuda/extending.py index cbc482aaa..a6b370523 100644 --- a/numba_cuda/numba/cuda/extending.py +++ b/numba_cuda/numba/cuda/extending.py @@ -4,4 +4,4 @@ from numba.core.extending import intrinsic as _intrinsic -intrinsic = _intrinsic(target='cuda') +intrinsic = _intrinsic(target="cuda") diff --git a/numba_cuda/numba/cuda/initialize.py b/numba_cuda/numba/cuda/initialize.py index e90c95b31..832891a66 100644 --- a/numba_cuda/numba/cuda/initialize.py +++ b/numba_cuda/numba/cuda/initialize.py @@ -4,9 +4,11 @@ def initialize_all(): from numba.cuda.decorators import jit from numba.cuda.dispatcher import CUDADispatcher - from numba.core.target_extension import (target_registry, - dispatcher_registry, - jit_registry) + from numba.core.target_extension import ( + target_registry, + dispatcher_registry, + jit_registry, + ) cuda_target = target_registry["cuda"] jit_registry[cuda_target] = jit diff --git a/numba_cuda/numba/cuda/intrinsic_wrapper.py b/numba_cuda/numba/cuda/intrinsic_wrapper.py index e02639f21..cfbdf06fe 100644 --- 
a/numba_cuda/numba/cuda/intrinsic_wrapper.py +++ b/numba_cuda/numba/cuda/intrinsic_wrapper.py @@ -45,7 +45,7 @@ def shfl_sync(mask, value, src_lane): from src_lane. If this is outside the warp, then the given value is returned. """ - return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0] @jit(device=True) @@ -65,7 +65,7 @@ def shfl_down_sync(mask, value, delta): from (laneid + delta). If this is outside the warp, then the given value is returned. """ - return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0] @jit(device=True) @@ -74,4 +74,4 @@ def shfl_xor_sync(mask, value, lane_mask): Shuffles value across the masked warp and returns the value from (laneid ^ lane_mask). """ - return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0] + return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0] diff --git a/numba_cuda/numba/cuda/intrinsics.py b/numba_cuda/numba/cuda/intrinsics.py index f5b186e88..2691ee8eb 100644 --- a/numba_cuda/numba/cuda/intrinsics.py +++ b/numba_cuda/numba/cuda/intrinsics.py @@ -9,9 +9,10 @@ from numba.cuda.extending import intrinsic -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Grid functions + def _type_grid_function(ndim): val = ndim.literal_value if val == 1: @@ -19,14 +20,14 @@ def _type_grid_function(ndim): elif val in (2, 3): restype = types.UniTuple(types.int64, val) else: - raise ValueError('argument can only be 1, 2, 3') + raise ValueError("argument can only be 1, 2, 3") return signature(restype, types.int32) @intrinsic def grid(typingctx, ndim): - '''grid(ndim) + """grid(ndim) Return the absolute position of the current thread in the entire grid of blocks. 
*ndim* should correspond to the number of dimensions declared when @@ -39,7 +40,7 @@ def grid(typingctx, ndim): and is similar for the other two indices, but using the ``y`` and ``z`` attributes. - ''' + """ if not isinstance(ndim, types.IntegerLiteral): raise RequireLiteralValue(ndim) @@ -59,7 +60,7 @@ def codegen(context, builder, sig, args): @intrinsic def gridsize(typingctx, ndim): - '''gridsize(ndim) + """gridsize(ndim) Return the absolute size (or shape) in threads of the entire grid of blocks. *ndim* should correspond to the number of dimensions declared when @@ -72,7 +73,7 @@ def gridsize(typingctx, ndim): and is similar for the other two indices, but using the ``y`` and ``z`` attributes. - ''' + """ if not isinstance(ndim, types.IntegerLiteral): raise RequireLiteralValue(ndim) @@ -87,17 +88,17 @@ def _nthreads_for_dim(builder, dim): def codegen(context, builder, sig, args): restype = sig.return_type - nx = _nthreads_for_dim(builder, 'x') + nx = _nthreads_for_dim(builder, "x") if restype == types.int64: return nx elif isinstance(restype, types.UniTuple): - ny = _nthreads_for_dim(builder, 'y') + ny = _nthreads_for_dim(builder, "y") if restype.count == 2: return cgutils.pack_array(builder, (nx, ny)) elif restype.count == 3: - nz = _nthreads_for_dim(builder, 'z') + nz = _nthreads_for_dim(builder, "z") return cgutils.pack_array(builder, (nx, ny, nz)) return sig, codegen @@ -108,37 +109,40 @@ def _warpsize(typingctx): sig = signature(types.int32) def codegen(context, builder, sig, args): - return nvvmutils.call_sreg(builder, 'warpsize') + return nvvmutils.call_sreg(builder, "warpsize") return sig, codegen -@overload_attribute(types.Module(cuda), 'warpsize', target='cuda') +@overload_attribute(types.Module(cuda), "warpsize", target="cuda") def cuda_warpsize(mod): - ''' + """ The size of a warp. All architectures implemented to date have a warp size of 32. 
- ''' + """ + def get(mod): return _warpsize() + return get -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # syncthreads + @intrinsic def syncthreads(typingctx): - ''' + """ Synchronize all threads in the same thread block. This function implements the same pattern as barriers in traditional multi-threaded programming: this function waits until all threads in the block call it, at which point it returns control to all its callers. - ''' + """ sig = signature(types.none) def codegen(context, builder, sig, args): - fname = 'llvm.nvvm.barrier0' + fname = "llvm.nvvm.barrier0" lmod = builder.module fnty = ir.FunctionType(ir.VoidType(), ()) sync = cgutils.get_or_insert_function(lmod, fnty, fname) @@ -164,40 +168,40 @@ def codegen(context, builder, sig, args): @intrinsic def syncthreads_count(typingctx, predicate): - ''' + """ syncthreads_count(predicate) An extension to numba.cuda.syncthreads where the return value is a count of the threads where predicate is true. - ''' - fname = 'llvm.nvvm.barrier0.popc' + """ + fname = "llvm.nvvm.barrier0.popc" return _syncthreads_predicate(typingctx, predicate, fname) @intrinsic def syncthreads_and(typingctx, predicate): - ''' + """ syncthreads_and(predicate) An extension to numba.cuda.syncthreads where 1 is returned if predicate is true for all threads or 0 otherwise. - ''' - fname = 'llvm.nvvm.barrier0.and' + """ + fname = "llvm.nvvm.barrier0.and" return _syncthreads_predicate(typingctx, predicate, fname) @intrinsic def syncthreads_or(typingctx, predicate): - ''' + """ syncthreads_or(predicate) An extension to numba.cuda.syncthreads where 1 is returned if predicate is true for any thread or 0 otherwise. 
- ''' - fname = 'llvm.nvvm.barrier0.or' + """ + fname = "llvm.nvvm.barrier0.or" return _syncthreads_predicate(typingctx, predicate, fname) -@overload_method(types.Integer, 'bit_count', target='cuda') +@overload_method(types.Integer, "bit_count", target="cuda") def integer_bit_count(i): return lambda i: cuda.popc(i) diff --git a/numba_cuda/numba/cuda/kernels/reduction.py b/numba_cuda/numba/cuda/kernels/reduction.py index f733935b6..52d362599 100644 --- a/numba_cuda/numba/cuda/kernels/reduction.py +++ b/numba_cuda/numba/cuda/kernels/reduction.py @@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype): from numba import cuda reduce_op = cuda.jit(device=True)(fn) - inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision + inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision max_blocksize = _NUMWARPS * _WARPSIZE @cuda.jit(device=True) @@ -86,8 +86,9 @@ def device_reduce_full_block(arr, partials, sm_partials): # warning: this is assuming 4 warps. # assert numwarps == 4 if tid < 2: - sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0], - sm_partials[tid + 2, 0]) + sm_partials[tid, 0] = reduce_op( + sm_partials[tid, 0], sm_partials[tid + 2, 0] + ) cuda.syncwarp() if tid == 0: partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0]) @@ -148,8 +149,9 @@ def gpu_reduce_block_strided(arr, partials, init, use_init): """ tid = cuda.threadIdx.x - sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size), - dtype=nbtype) + sm_partials = cuda.shared.array( + (_NUMWARPS, inner_sm_size), dtype=nbtype + ) if cuda.blockDim.x == max_blocksize: device_reduce_full_block(arr, partials, sm_partials) else: @@ -238,17 +240,15 @@ def __call__(self, arr, size=None, res=None, init=0, stream=0): if size_full: # kernel for the fully populated threadblocks - kernel[full_blockct, blocksize, stream](arr[:size_full], - partials[:full_blockct], - init, - True) + kernel[full_blockct, blocksize, stream]( + arr[:size_full], partials[:full_blockct], init, True + ) if size_partial: 
# kernel for partially populated threadblocks - kernel[1, size_partial, stream](arr[size_full:], - partials[full_blockct:], - init, - not full_blockct) + kernel[1, size_partial, stream]( + arr[size_full:], partials[full_blockct:], init, not full_blockct + ) if partials.size > 1: # finish up diff --git a/numba_cuda/numba/cuda/kernels/transpose.py b/numba_cuda/numba/cuda/kernels/transpose.py index b1df36e04..1a1af2b41 100644 --- a/numba_cuda/numba/cuda/kernels/transpose.py +++ b/numba_cuda/numba/cuda/kernels/transpose.py @@ -18,16 +18,14 @@ def transpose(a, b=None): """ # prefer `a`'s stream if - stream = getattr(a, 'stream', 0) + stream = getattr(a, "stream", 0) if not b: cols, rows = a.shape strides = a.dtype.itemsize * cols, a.dtype.itemsize b = cuda.cudadrv.devicearray.DeviceNDArray( - (rows, cols), - strides, - dtype=a.dtype, - stream=stream) + (rows, cols), strides, dtype=a.dtype, stream=stream + ) dt = nps.from_dtype(a.dtype) @@ -40,7 +38,6 @@ def transpose(a, b=None): @cuda.jit def kernel(input, output): - tile = cuda.shared.array(shape=tile_shape, dtype=dt) tx = cuda.threadIdx.x diff --git a/numba_cuda/numba/cuda/libdevice.py b/numba_cuda/numba/cuda/libdevice.py index 303ade74b..4a066b77b 100644 --- a/numba_cuda/numba/cuda/libdevice.py +++ b/numba_cuda/numba/cuda/libdevice.py @@ -5,7 +5,7 @@ def abs(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def acos(x): @@ -15,7 +15,7 @@ def acos(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def acosf(x): @@ -25,7 +25,7 @@ def acosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def acosh(x): @@ -35,7 +35,7 @@ def acosh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def acoshf(x): @@ -45,7 +45,7 @@ def acoshf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def asin(x): @@ -55,7 +55,7 @@ def asin(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def asinf(x): @@ -65,7 +65,7 @@ def asinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def asinh(x): @@ -75,7 +75,7 @@ def asinh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def asinhf(x): @@ -85,7 +85,7 @@ def asinhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def atan(x): @@ -95,7 +95,7 @@ def atan(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def atan2(x, y): @@ -107,7 +107,7 @@ def atan2(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def atan2f(x, y): @@ -119,7 +119,7 @@ def atan2f(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def atanf(x): @@ -129,7 +129,7 @@ def atanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def atanh(x): @@ -139,7 +139,7 @@ def atanh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def atanhf(x): @@ -149,7 +149,7 @@ def atanhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def brev(x): @@ -159,7 +159,7 @@ def brev(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def brevll(x): @@ -169,7 +169,7 @@ def brevll(x): :param x: Argument. :type x: int64 :rtype: int64 -""" + """ def byte_perm(x, y, z): @@ -183,7 +183,7 @@ def byte_perm(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def cbrt(x): @@ -193,7 +193,7 @@ def cbrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cbrtf(x): @@ -203,7 +203,7 @@ def cbrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ceil(x): @@ -213,7 +213,7 @@ def ceil(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def ceilf(x): @@ -223,7 +223,7 @@ def ceilf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def clz(x): @@ -233,7 +233,7 @@ def clz(x): :param x: Argument. 
:type x: int32 :rtype: int32 -""" + """ def clzll(x): @@ -243,7 +243,7 @@ def clzll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def copysign(x, y): @@ -255,7 +255,7 @@ def copysign(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def copysignf(x, y): @@ -267,7 +267,7 @@ def copysignf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def cos(x): @@ -277,7 +277,7 @@ def cos(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cosf(x): @@ -287,7 +287,7 @@ def cosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def cosh(x): @@ -297,7 +297,7 @@ def cosh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def coshf(x): @@ -307,7 +307,7 @@ def coshf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def cospi(x): @@ -317,7 +317,7 @@ def cospi(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def cospif(x): @@ -327,7 +327,7 @@ def cospif(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def dadd_rd(x, y): @@ -339,7 +339,7 @@ def dadd_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_rn(x, y): @@ -351,7 +351,7 @@ def dadd_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_ru(x, y): @@ -363,7 +363,7 @@ def dadd_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dadd_rz(x, y): @@ -375,7 +375,7 @@ def dadd_rz(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rd(x, y): @@ -387,7 +387,7 @@ def ddiv_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rn(x, y): @@ -399,7 +399,7 @@ def ddiv_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_ru(x, y): @@ -411,7 +411,7 @@ def ddiv_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def ddiv_rz(x, y): @@ -423,7 +423,7 @@ def ddiv_rz(x, y): :param y: Argument. 
:type y: float64 :rtype: float64 -""" + """ def dmul_rd(x, y): @@ -435,7 +435,7 @@ def dmul_rd(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_rn(x, y): @@ -447,7 +447,7 @@ def dmul_rn(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_ru(x, y): @@ -459,7 +459,7 @@ def dmul_ru(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def dmul_rz(x, y): @@ -471,7 +471,7 @@ def dmul_rz(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def double2float_rd(d): @@ -481,7 +481,7 @@ def double2float_rd(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_rn(d): @@ -491,7 +491,7 @@ def double2float_rn(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_ru(d): @@ -501,7 +501,7 @@ def double2float_ru(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2float_rz(d): @@ -511,7 +511,7 @@ def double2float_rz(d): :param d: Argument. :type d: float64 :rtype: float32 -""" + """ def double2hiint(d): @@ -521,7 +521,7 @@ def double2hiint(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rd(d): @@ -531,7 +531,7 @@ def double2int_rd(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rn(d): @@ -541,7 +541,7 @@ def double2int_rn(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_ru(d): @@ -551,7 +551,7 @@ def double2int_ru(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2int_rz(d): @@ -561,7 +561,7 @@ def double2int_rz(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2ll_rd(f): @@ -571,7 +571,7 @@ def double2ll_rd(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ll_rn(f): @@ -581,7 +581,7 @@ def double2ll_rn(f): :param f: Argument. 
:type f: float64 :rtype: int64 -""" + """ def double2ll_ru(f): @@ -591,7 +591,7 @@ def double2ll_ru(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ll_rz(f): @@ -601,7 +601,7 @@ def double2ll_rz(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2loint(d): @@ -611,7 +611,7 @@ def double2loint(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rd(d): @@ -621,7 +621,7 @@ def double2uint_rd(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rn(d): @@ -631,7 +631,7 @@ def double2uint_rn(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_ru(d): @@ -641,7 +641,7 @@ def double2uint_ru(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2uint_rz(d): @@ -651,7 +651,7 @@ def double2uint_rz(d): :param d: Argument. :type d: float64 :rtype: int32 -""" + """ def double2ull_rd(f): @@ -661,7 +661,7 @@ def double2ull_rd(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_rn(f): @@ -671,7 +671,7 @@ def double2ull_rn(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_ru(f): @@ -681,7 +681,7 @@ def double2ull_ru(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double2ull_rz(f): @@ -691,7 +691,7 @@ def double2ull_rz(f): :param f: Argument. :type f: float64 :rtype: int64 -""" + """ def double_as_longlong(x): @@ -701,7 +701,7 @@ def double_as_longlong(x): :param x: Argument. :type x: float64 :rtype: int64 -""" + """ def drcp_rd(x): @@ -711,7 +711,7 @@ def drcp_rd(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_rn(x): @@ -721,7 +721,7 @@ def drcp_rn(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_ru(x): @@ -731,7 +731,7 @@ def drcp_ru(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def drcp_rz(x): @@ -741,7 +741,7 @@ def drcp_rz(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def dsqrt_rd(x): @@ -751,7 +751,7 @@ def dsqrt_rd(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_rn(x): @@ -761,7 +761,7 @@ def dsqrt_rn(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_ru(x): @@ -771,7 +771,7 @@ def dsqrt_ru(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def dsqrt_rz(x): @@ -781,7 +781,7 @@ def dsqrt_rz(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erf(x): @@ -791,7 +791,7 @@ def erf(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfc(x): @@ -801,7 +801,7 @@ def erfc(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcf(x): @@ -811,7 +811,7 @@ def erfcf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfcinv(x): @@ -821,7 +821,7 @@ def erfcinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcinvf(x): @@ -831,7 +831,7 @@ def erfcinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfcx(x): @@ -841,7 +841,7 @@ def erfcx(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfcxf(x): @@ -851,7 +851,7 @@ def erfcxf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erff(x): @@ -861,7 +861,7 @@ def erff(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def erfinv(x): @@ -871,7 +871,7 @@ def erfinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def erfinvf(x): @@ -881,7 +881,7 @@ def erfinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def exp(x): @@ -891,7 +891,7 @@ def exp(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp10(x): @@ -901,7 +901,7 @@ def exp10(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp10f(x): @@ -911,7 +911,7 @@ def exp10f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def exp2(x): @@ -921,7 +921,7 @@ def exp2(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def exp2f(x): @@ -931,7 +931,7 @@ def exp2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def expf(x): @@ -941,7 +941,7 @@ def expf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def expm1(x): @@ -951,7 +951,7 @@ def expm1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def expm1f(x): @@ -961,7 +961,7 @@ def expm1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fabs(f): @@ -971,7 +971,7 @@ def fabs(f): :param f: Argument. :type f: float64 :rtype: float64 -""" + """ def fabsf(f): @@ -981,7 +981,7 @@ def fabsf(f): :param f: Argument. :type f: float32 :rtype: float32 -""" + """ def fadd_rd(x, y): @@ -993,7 +993,7 @@ def fadd_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_rn(x, y): @@ -1005,7 +1005,7 @@ def fadd_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_ru(x, y): @@ -1017,7 +1017,7 @@ def fadd_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fadd_rz(x, y): @@ -1029,7 +1029,7 @@ def fadd_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_cosf(x): @@ -1039,7 +1039,7 @@ def fast_cosf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_exp10f(x): @@ -1049,7 +1049,7 @@ def fast_exp10f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_expf(x): @@ -1059,7 +1059,7 @@ def fast_expf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_fdividef(x, y): @@ -1071,7 +1071,7 @@ def fast_fdividef(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_log10f(x): @@ -1081,7 +1081,7 @@ def fast_log10f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def fast_log2f(x): @@ -1091,7 +1091,7 @@ def fast_log2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_logf(x): @@ -1101,7 +1101,7 @@ def fast_logf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_powf(x, y): @@ -1113,7 +1113,7 @@ def fast_powf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fast_sincosf(x): @@ -1123,7 +1123,7 @@ def fast_sincosf(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def fast_sinf(x): @@ -1133,7 +1133,7 @@ def fast_sinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fast_tanf(x): @@ -1143,7 +1143,7 @@ def fast_tanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fdim(x, y): @@ -1155,7 +1155,7 @@ def fdim(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fdimf(x, y): @@ -1167,7 +1167,7 @@ def fdimf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rd(x, y): @@ -1179,7 +1179,7 @@ def fdiv_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rn(x, y): @@ -1191,7 +1191,7 @@ def fdiv_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_ru(x, y): @@ -1203,7 +1203,7 @@ def fdiv_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fdiv_rz(x, y): @@ -1215,7 +1215,7 @@ def fdiv_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def ffs(x): @@ -1225,7 +1225,7 @@ def ffs(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def ffsll(x): @@ -1235,7 +1235,7 @@ def ffsll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def finitef(x): @@ -1245,7 +1245,7 @@ def finitef(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def float2half_rn(f): @@ -1255,7 +1255,7 @@ def float2half_rn(f): :param f: Argument. 
:type f: float32 :rtype: int16 -""" + """ def float2int_rd(x): @@ -1265,7 +1265,7 @@ def float2int_rd(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_rn(x): @@ -1275,7 +1275,7 @@ def float2int_rn(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_ru(x): @@ -1285,7 +1285,7 @@ def float2int_ru(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2int_rz(x): @@ -1295,7 +1295,7 @@ def float2int_rz(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2ll_rd(f): @@ -1305,7 +1305,7 @@ def float2ll_rd(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_rn(f): @@ -1315,7 +1315,7 @@ def float2ll_rn(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_ru(f): @@ -1325,7 +1325,7 @@ def float2ll_ru(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ll_rz(f): @@ -1335,7 +1335,7 @@ def float2ll_rz(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2uint_rd(x): @@ -1345,7 +1345,7 @@ def float2uint_rd(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_rn(x): @@ -1355,7 +1355,7 @@ def float2uint_rn(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_ru(x): @@ -1365,7 +1365,7 @@ def float2uint_ru(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2uint_rz(x): @@ -1375,7 +1375,7 @@ def float2uint_rz(x): :param in: Argument. :type in: float32 :rtype: int32 -""" + """ def float2ull_rd(f): @@ -1385,7 +1385,7 @@ def float2ull_rd(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ull_rn(f): @@ -1395,7 +1395,7 @@ def float2ull_rn(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float2ull_ru(f): @@ -1405,7 +1405,7 @@ def float2ull_ru(f): :param f: Argument. 
:type f: float32 :rtype: int64 -""" + """ def float2ull_rz(f): @@ -1415,7 +1415,7 @@ def float2ull_rz(f): :param f: Argument. :type f: float32 :rtype: int64 -""" + """ def float_as_int(x): @@ -1425,7 +1425,7 @@ def float_as_int(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def floor(f): @@ -1435,7 +1435,7 @@ def floor(f): :param f: Argument. :type f: float64 :rtype: float64 -""" + """ def floorf(f): @@ -1445,7 +1445,7 @@ def floorf(f): :param f: Argument. :type f: float32 :rtype: float32 -""" + """ def fma(x, y, z): @@ -1459,7 +1459,7 @@ def fma(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rd(x, y, z): @@ -1473,7 +1473,7 @@ def fma_rd(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rn(x, y, z): @@ -1487,7 +1487,7 @@ def fma_rn(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_ru(x, y, z): @@ -1501,7 +1501,7 @@ def fma_ru(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fma_rz(x, y, z): @@ -1515,7 +1515,7 @@ def fma_rz(x, y, z): :param z: Argument. :type z: float64 :rtype: float64 -""" + """ def fmaf(x, y, z): @@ -1529,7 +1529,7 @@ def fmaf(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rd(x, y, z): @@ -1543,7 +1543,7 @@ def fmaf_rd(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rn(x, y, z): @@ -1557,7 +1557,7 @@ def fmaf_rn(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_ru(x, y, z): @@ -1571,7 +1571,7 @@ def fmaf_ru(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmaf_rz(x, y, z): @@ -1585,7 +1585,7 @@ def fmaf_rz(x, y, z): :param z: Argument. :type z: float32 :rtype: float32 -""" + """ def fmax(x, y): @@ -1597,7 +1597,7 @@ def fmax(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fmaxf(x, y): @@ -1609,7 +1609,7 @@ def fmaxf(x, y): :param y: Argument. 
:type y: float32 :rtype: float32 -""" + """ def fmin(x, y): @@ -1621,7 +1621,7 @@ def fmin(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fminf(x, y): @@ -1633,7 +1633,7 @@ def fminf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmod(x, y): @@ -1645,7 +1645,7 @@ def fmod(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def fmodf(x, y): @@ -1657,7 +1657,7 @@ def fmodf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rd(x, y): @@ -1669,7 +1669,7 @@ def fmul_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rn(x, y): @@ -1681,7 +1681,7 @@ def fmul_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_ru(x, y): @@ -1693,7 +1693,7 @@ def fmul_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fmul_rz(x, y): @@ -1705,7 +1705,7 @@ def fmul_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def frcp_rd(x): @@ -1715,7 +1715,7 @@ def frcp_rd(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_rn(x): @@ -1725,7 +1725,7 @@ def frcp_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_ru(x): @@ -1735,7 +1735,7 @@ def frcp_ru(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frcp_rz(x): @@ -1745,7 +1745,7 @@ def frcp_rz(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def frexp(x): @@ -1755,7 +1755,7 @@ def frexp(x): :param x: Argument. :type x: float64 :rtype: Tuple(float64, int32) -""" + """ def frexpf(x): @@ -1765,7 +1765,7 @@ def frexpf(x): :param x: Argument. :type x: float32 :rtype: Tuple(float32, int32) -""" + """ def frsqrt_rn(x): @@ -1775,7 +1775,7 @@ def frsqrt_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_rd(x): @@ -1785,7 +1785,7 @@ def fsqrt_rd(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def fsqrt_rn(x): @@ -1795,7 +1795,7 @@ def fsqrt_rn(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_ru(x): @@ -1805,7 +1805,7 @@ def fsqrt_ru(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsqrt_rz(x): @@ -1815,7 +1815,7 @@ def fsqrt_rz(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def fsub_rd(x, y): @@ -1827,7 +1827,7 @@ def fsub_rd(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_rn(x, y): @@ -1839,7 +1839,7 @@ def fsub_rn(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_ru(x, y): @@ -1851,7 +1851,7 @@ def fsub_ru(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def fsub_rz(x, y): @@ -1863,7 +1863,7 @@ def fsub_rz(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def hadd(x, y): @@ -1875,7 +1875,7 @@ def hadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def half2float(h): @@ -1885,7 +1885,7 @@ def half2float(h): :param h: Argument. :type h: int16 :rtype: float32 -""" + """ def hiloint2double(x, y): @@ -1897,7 +1897,7 @@ def hiloint2double(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def hypot(x, y): @@ -1909,7 +1909,7 @@ def hypot(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def hypotf(x, y): @@ -1921,7 +1921,7 @@ def hypotf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def ilogb(x): @@ -1931,7 +1931,7 @@ def ilogb(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def ilogbf(x): @@ -1941,7 +1941,7 @@ def ilogbf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def int2double_rn(i): @@ -1951,7 +1951,7 @@ def int2double_rn(i): :param i: Argument. :type i: int32 :rtype: float64 -""" + """ def int2float_rd(x): @@ -1961,7 +1961,7 @@ def int2float_rd(x): :param in: Argument. 
:type in: int32 :rtype: float32 -""" + """ def int2float_rn(x): @@ -1971,7 +1971,7 @@ def int2float_rn(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int2float_ru(x): @@ -1981,7 +1981,7 @@ def int2float_ru(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int2float_rz(x): @@ -1991,7 +1991,7 @@ def int2float_rz(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def int_as_float(x): @@ -2001,7 +2001,7 @@ def int_as_float(x): :param x: Argument. :type x: int32 :rtype: float32 -""" + """ def isfinited(x): @@ -2011,7 +2011,7 @@ def isfinited(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isinfd(x): @@ -2021,7 +2021,7 @@ def isinfd(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isinff(x): @@ -2031,7 +2031,7 @@ def isinff(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def isnand(x): @@ -2041,7 +2041,7 @@ def isnand(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def isnanf(x): @@ -2051,7 +2051,7 @@ def isnanf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def j0(x): @@ -2061,7 +2061,7 @@ def j0(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def j0f(x): @@ -2071,7 +2071,7 @@ def j0f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def j1(x): @@ -2081,7 +2081,7 @@ def j1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def j1f(x): @@ -2091,7 +2091,7 @@ def j1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def jn(n, x): @@ -2103,7 +2103,7 @@ def jn(n, x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def jnf(n, x): @@ -2115,7 +2115,7 @@ def jnf(n, x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ldexp(x, y): @@ -2127,7 +2127,7 @@ def ldexp(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def ldexpf(x, y): @@ -2139,7 +2139,7 @@ def ldexpf(x, y): :param y: Argument. 
:type y: int32 :rtype: float32 -""" + """ def lgamma(x): @@ -2149,7 +2149,7 @@ def lgamma(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def lgammaf(x): @@ -2159,7 +2159,7 @@ def lgammaf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def ll2double_rd(l): @@ -2169,7 +2169,7 @@ def ll2double_rd(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_rn(l): @@ -2179,7 +2179,7 @@ def ll2double_rn(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_ru(l): @@ -2189,7 +2189,7 @@ def ll2double_ru(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2double_rz(l): @@ -2199,7 +2199,7 @@ def ll2double_rz(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ll2float_rd(l): @@ -2209,7 +2209,7 @@ def ll2float_rd(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_rn(l): @@ -2219,7 +2219,7 @@ def ll2float_rn(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_ru(l): @@ -2229,7 +2229,7 @@ def ll2float_ru(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ll2float_rz(l): @@ -2239,7 +2239,7 @@ def ll2float_rz(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def llabs(x): @@ -2249,7 +2249,7 @@ def llabs(x): :param x: Argument. :type x: int64 :rtype: int64 -""" + """ def llmax(x, y): @@ -2261,7 +2261,7 @@ def llmax(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def llmin(x, y): @@ -2273,7 +2273,7 @@ def llmin(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def llrint(x): @@ -2283,7 +2283,7 @@ def llrint(x): :param x: Argument. :type x: float64 :rtype: int64 -""" + """ def llrintf(x): @@ -2293,7 +2293,7 @@ def llrintf(x): :param x: Argument. :type x: float32 :rtype: int64 -""" + """ def llround(x): @@ -2303,7 +2303,7 @@ def llround(x): :param x: Argument. 
:type x: float64 :rtype: int64 -""" + """ def llroundf(x): @@ -2313,7 +2313,7 @@ def llroundf(x): :param x: Argument. :type x: float32 :rtype: int64 -""" + """ def log(x): @@ -2323,7 +2323,7 @@ def log(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log10(x): @@ -2333,7 +2333,7 @@ def log10(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log10f(x): @@ -2343,7 +2343,7 @@ def log10f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def log1p(x): @@ -2353,7 +2353,7 @@ def log1p(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log1pf(x): @@ -2363,7 +2363,7 @@ def log1pf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def log2(x): @@ -2373,7 +2373,7 @@ def log2(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def log2f(x): @@ -2383,7 +2383,7 @@ def log2f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def logb(x): @@ -2393,7 +2393,7 @@ def logb(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def logbf(x): @@ -2403,7 +2403,7 @@ def logbf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def logf(x): @@ -2413,7 +2413,7 @@ def logf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def longlong_as_double(x): @@ -2423,7 +2423,7 @@ def longlong_as_double(x): :param x: Argument. :type x: int64 :rtype: float64 -""" + """ def max(x, y): @@ -2435,7 +2435,7 @@ def max(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def min(x, y): @@ -2447,7 +2447,7 @@ def min(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def modf(x): @@ -2457,7 +2457,7 @@ def modf(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def modff(x): @@ -2467,7 +2467,7 @@ def modff(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def mul24(x, y): @@ -2479,7 +2479,7 @@ def mul24(x, y): :param y: Argument. 
:type y: int32 :rtype: int32 -""" + """ def mul64hi(x, y): @@ -2491,7 +2491,7 @@ def mul64hi(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def mulhi(x, y): @@ -2503,7 +2503,7 @@ def mulhi(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def nearbyint(x): @@ -2513,7 +2513,7 @@ def nearbyint(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def nearbyintf(x): @@ -2523,7 +2523,7 @@ def nearbyintf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def nextafter(x, y): @@ -2535,7 +2535,7 @@ def nextafter(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def nextafterf(x, y): @@ -2547,7 +2547,7 @@ def nextafterf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def normcdf(x): @@ -2557,7 +2557,7 @@ def normcdf(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def normcdff(x): @@ -2567,7 +2567,7 @@ def normcdff(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def normcdfinv(x): @@ -2577,7 +2577,7 @@ def normcdfinv(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def normcdfinvf(x): @@ -2587,7 +2587,7 @@ def normcdfinvf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def popc(x): @@ -2597,7 +2597,7 @@ def popc(x): :param x: Argument. :type x: int32 :rtype: int32 -""" + """ def popcll(x): @@ -2607,7 +2607,7 @@ def popcll(x): :param x: Argument. :type x: int64 :rtype: int32 -""" + """ def pow(x, y): @@ -2619,7 +2619,7 @@ def pow(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def powf(x, y): @@ -2631,7 +2631,7 @@ def powf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def powi(x, y): @@ -2643,7 +2643,7 @@ def powi(x, y): :param y: Argument. :type y: int32 :rtype: float64 -""" + """ def powif(x, y): @@ -2655,7 +2655,7 @@ def powif(x, y): :param y: Argument. 
:type y: int32 :rtype: float32 -""" + """ def rcbrt(x): @@ -2665,7 +2665,7 @@ def rcbrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rcbrtf(x): @@ -2675,7 +2675,7 @@ def rcbrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def remainder(x, y): @@ -2687,7 +2687,7 @@ def remainder(x, y): :param y: Argument. :type y: float64 :rtype: float64 -""" + """ def remainderf(x, y): @@ -2699,7 +2699,7 @@ def remainderf(x, y): :param y: Argument. :type y: float32 :rtype: float32 -""" + """ def remquo(x, y): @@ -2711,7 +2711,7 @@ def remquo(x, y): :param y: Argument. :type y: float64 :rtype: Tuple(float64, int32) -""" + """ def remquof(x, y): @@ -2723,7 +2723,7 @@ def remquof(x, y): :param y: Argument. :type y: float32 :rtype: Tuple(float32, int32) -""" + """ def rhadd(x, y): @@ -2735,7 +2735,7 @@ def rhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def rint(x): @@ -2745,7 +2745,7 @@ def rint(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rintf(x): @@ -2755,7 +2755,7 @@ def rintf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def round(x): @@ -2765,7 +2765,7 @@ def round(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def roundf(x): @@ -2775,7 +2775,7 @@ def roundf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def rsqrt(x): @@ -2785,7 +2785,7 @@ def rsqrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def rsqrtf(x): @@ -2795,7 +2795,7 @@ def rsqrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sad(x, y, z): @@ -2809,7 +2809,7 @@ def sad(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def saturatef(x): @@ -2819,7 +2819,7 @@ def saturatef(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def scalbn(x, y): @@ -2831,7 +2831,7 @@ def scalbn(x, y): :param y: Argument. 
:type y: int32 :rtype: float64 -""" + """ def scalbnf(x, y): @@ -2843,7 +2843,7 @@ def scalbnf(x, y): :param y: Argument. :type y: int32 :rtype: float32 -""" + """ def signbitd(x): @@ -2853,7 +2853,7 @@ def signbitd(x): :param x: Argument. :type x: float64 :rtype: int32 -""" + """ def signbitf(x): @@ -2863,7 +2863,7 @@ def signbitf(x): :param x: Argument. :type x: float32 :rtype: int32 -""" + """ def sin(x): @@ -2873,7 +2873,7 @@ def sin(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sincos(x): @@ -2883,7 +2883,7 @@ def sincos(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def sincosf(x): @@ -2893,7 +2893,7 @@ def sincosf(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def sincospi(x): @@ -2903,7 +2903,7 @@ def sincospi(x): :param x: Argument. :type x: float64 :rtype: UniTuple(float64 x 2) -""" + """ def sincospif(x): @@ -2913,7 +2913,7 @@ def sincospif(x): :param x: Argument. :type x: float32 :rtype: UniTuple(float32 x 2) -""" + """ def sinf(x): @@ -2923,7 +2923,7 @@ def sinf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sinh(x): @@ -2933,7 +2933,7 @@ def sinh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sinhf(x): @@ -2943,7 +2943,7 @@ def sinhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sinpi(x): @@ -2953,7 +2953,7 @@ def sinpi(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sinpif(x): @@ -2963,7 +2963,7 @@ def sinpif(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def sqrt(x): @@ -2973,7 +2973,7 @@ def sqrt(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def sqrtf(x): @@ -2983,7 +2983,7 @@ def sqrtf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tan(x): @@ -2993,7 +2993,7 @@ def tan(x): :param x: Argument. 
:type x: float64 :rtype: float64 -""" + """ def tanf(x): @@ -3003,7 +3003,7 @@ def tanf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tanh(x): @@ -3013,7 +3013,7 @@ def tanh(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def tanhf(x): @@ -3023,7 +3023,7 @@ def tanhf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def tgamma(x): @@ -3033,7 +3033,7 @@ def tgamma(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def tgammaf(x): @@ -3043,7 +3043,7 @@ def tgammaf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def trunc(x): @@ -3053,7 +3053,7 @@ def trunc(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def truncf(x): @@ -3063,7 +3063,7 @@ def truncf(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def uhadd(x, y): @@ -3075,7 +3075,7 @@ def uhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def uint2double_rn(i): @@ -3085,7 +3085,7 @@ def uint2double_rn(i): :param i: Argument. :type i: int32 :rtype: float64 -""" + """ def uint2float_rd(x): @@ -3095,7 +3095,7 @@ def uint2float_rd(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_rn(x): @@ -3105,7 +3105,7 @@ def uint2float_rn(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_ru(x): @@ -3115,7 +3115,7 @@ def uint2float_ru(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def uint2float_rz(x): @@ -3125,7 +3125,7 @@ def uint2float_rz(x): :param in: Argument. :type in: int32 :rtype: float32 -""" + """ def ull2double_rd(l): @@ -3135,7 +3135,7 @@ def ull2double_rd(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2double_rn(l): @@ -3145,7 +3145,7 @@ def ull2double_rn(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2double_ru(l): @@ -3155,7 +3155,7 @@ def ull2double_ru(l): :param l: Argument. 
:type l: int64 :rtype: float64 -""" + """ def ull2double_rz(l): @@ -3165,7 +3165,7 @@ def ull2double_rz(l): :param l: Argument. :type l: int64 :rtype: float64 -""" + """ def ull2float_rd(l): @@ -3175,7 +3175,7 @@ def ull2float_rd(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_rn(l): @@ -3185,7 +3185,7 @@ def ull2float_rn(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_ru(l): @@ -3195,7 +3195,7 @@ def ull2float_ru(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ull2float_rz(l): @@ -3205,7 +3205,7 @@ def ull2float_rz(l): :param l: Argument. :type l: int64 :rtype: float32 -""" + """ def ullmax(x, y): @@ -3217,7 +3217,7 @@ def ullmax(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def ullmin(x, y): @@ -3229,7 +3229,7 @@ def ullmin(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def umax(x, y): @@ -3241,7 +3241,7 @@ def umax(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umin(x, y): @@ -3253,7 +3253,7 @@ def umin(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umul24(x, y): @@ -3265,7 +3265,7 @@ def umul24(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def umul64hi(x, y): @@ -3277,7 +3277,7 @@ def umul64hi(x, y): :param y: Argument. :type y: int64 :rtype: int64 -""" + """ def umulhi(x, y): @@ -3289,7 +3289,7 @@ def umulhi(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def urhadd(x, y): @@ -3301,7 +3301,7 @@ def urhadd(x, y): :param y: Argument. :type y: int32 :rtype: int32 -""" + """ def usad(x, y, z): @@ -3315,7 +3315,7 @@ def usad(x, y, z): :param z: Argument. :type z: int32 :rtype: int32 -""" + """ def y0(x): @@ -3325,7 +3325,7 @@ def y0(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def y0f(x): @@ -3335,7 +3335,7 @@ def y0f(x): :param x: Argument. 
:type x: float32 :rtype: float32 -""" + """ def y1(x): @@ -3345,7 +3345,7 @@ def y1(x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def y1f(x): @@ -3355,7 +3355,7 @@ def y1f(x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ def yn(n, x): @@ -3367,7 +3367,7 @@ def yn(n, x): :param x: Argument. :type x: float64 :rtype: float64 -""" + """ def ynf(n, x): @@ -3379,4 +3379,4 @@ def ynf(n, x): :param x: Argument. :type x: float32 :rtype: float32 -""" + """ diff --git a/numba_cuda/numba/cuda/libdeviceimpl.py b/numba_cuda/numba/cuda/libdeviceimpl.py index 4bb2e905e..827948954 100644 --- a/numba_cuda/numba/cuda/libdeviceimpl.py +++ b/numba_cuda/numba/cuda/libdeviceimpl.py @@ -49,8 +49,9 @@ def core(context, builder, sig, args): for arg in prototype_args: if arg.is_ptr: # Allocate space for return value and add to args - tmp_arg = cgutils.alloca_once(builder, - context.get_value_type(arg.ty)) + tmp_arg = cgutils.alloca_once( + builder, context.get_value_type(arg.ty) + ) actual_args.append(tmp_arg) virtual_args.append(tmp_arg) else: diff --git a/numba_cuda/numba/cuda/mathimpl.py b/numba_cuda/numba/cuda/mathimpl.py index c22deb564..9b1e1f767 100644 --- a/numba_cuda/numba/cuda/mathimpl.py +++ b/numba_cuda/numba/cuda/mathimpl.py @@ -12,57 +12,57 @@ booleans = [] -booleans += [('isnand', 'isnanf', math.isnan)] -booleans += [('isinfd', 'isinff', math.isinf)] -booleans += [('isfinited', 'finitef', math.isfinite)] +booleans += [("isnand", "isnanf", math.isnan)] +booleans += [("isinfd", "isinff", math.isinf)] +booleans += [("isfinited", "finitef", math.isfinite)] unarys = [] -unarys += [('ceil', 'ceilf', math.ceil)] -unarys += [('floor', 'floorf', math.floor)] -unarys += [('fabs', 'fabsf', math.fabs)] -unarys += [('exp', 'expf', math.exp)] -unarys += [('expm1', 'expm1f', math.expm1)] -unarys += [('erf', 'erff', math.erf)] -unarys += [('erfc', 'erfcf', math.erfc)] -unarys += [('tgamma', 'tgammaf', math.gamma)] -unarys += [('lgamma', 'lgammaf', 
math.lgamma)] -unarys += [('sqrt', 'sqrtf', math.sqrt)] -unarys += [('log', 'logf', math.log)] -unarys += [('log2', 'log2f', math.log2)] -unarys += [('log10', 'log10f', math.log10)] -unarys += [('log1p', 'log1pf', math.log1p)] -unarys += [('acosh', 'acoshf', math.acosh)] -unarys += [('acos', 'acosf', math.acos)] -unarys += [('cos', 'cosf', math.cos)] -unarys += [('cosh', 'coshf', math.cosh)] -unarys += [('asinh', 'asinhf', math.asinh)] -unarys += [('asin', 'asinf', math.asin)] -unarys += [('sin', 'sinf', math.sin)] -unarys += [('sinh', 'sinhf', math.sinh)] -unarys += [('atan', 'atanf', math.atan)] -unarys += [('atanh', 'atanhf', math.atanh)] -unarys += [('tan', 'tanf', math.tan)] -unarys += [('trunc', 'truncf', math.trunc)] +unarys += [("ceil", "ceilf", math.ceil)] +unarys += [("floor", "floorf", math.floor)] +unarys += [("fabs", "fabsf", math.fabs)] +unarys += [("exp", "expf", math.exp)] +unarys += [("expm1", "expm1f", math.expm1)] +unarys += [("erf", "erff", math.erf)] +unarys += [("erfc", "erfcf", math.erfc)] +unarys += [("tgamma", "tgammaf", math.gamma)] +unarys += [("lgamma", "lgammaf", math.lgamma)] +unarys += [("sqrt", "sqrtf", math.sqrt)] +unarys += [("log", "logf", math.log)] +unarys += [("log2", "log2f", math.log2)] +unarys += [("log10", "log10f", math.log10)] +unarys += [("log1p", "log1pf", math.log1p)] +unarys += [("acosh", "acoshf", math.acosh)] +unarys += [("acos", "acosf", math.acos)] +unarys += [("cos", "cosf", math.cos)] +unarys += [("cosh", "coshf", math.cosh)] +unarys += [("asinh", "asinhf", math.asinh)] +unarys += [("asin", "asinf", math.asin)] +unarys += [("sin", "sinf", math.sin)] +unarys += [("sinh", "sinhf", math.sinh)] +unarys += [("atan", "atanf", math.atan)] +unarys += [("atanh", "atanhf", math.atanh)] +unarys += [("tan", "tanf", math.tan)] +unarys += [("trunc", "truncf", math.trunc)] unarys_fastmath = {} -unarys_fastmath['cosf'] = 'fast_cosf' -unarys_fastmath['sinf'] = 'fast_sinf' -unarys_fastmath['tanf'] = 'fast_tanf' 
-unarys_fastmath['expf'] = 'fast_expf' -unarys_fastmath['log2f'] = 'fast_log2f' -unarys_fastmath['log10f'] = 'fast_log10f' -unarys_fastmath['logf'] = 'fast_logf' +unarys_fastmath["cosf"] = "fast_cosf" +unarys_fastmath["sinf"] = "fast_sinf" +unarys_fastmath["tanf"] = "fast_tanf" +unarys_fastmath["expf"] = "fast_expf" +unarys_fastmath["log2f"] = "fast_log2f" +unarys_fastmath["log10f"] = "fast_log10f" +unarys_fastmath["logf"] = "fast_logf" binarys = [] -binarys += [('copysign', 'copysignf', math.copysign)] -binarys += [('atan2', 'atan2f', math.atan2)] -binarys += [('pow', 'powf', math.pow)] -binarys += [('fmod', 'fmodf', math.fmod)] -binarys += [('hypot', 'hypotf', math.hypot)] -binarys += [('remainder', 'remainderf', math.remainder)] +binarys += [("copysign", "copysignf", math.copysign)] +binarys += [("atan2", "atan2f", math.atan2)] +binarys += [("pow", "powf", math.pow)] +binarys += [("fmod", "fmodf", math.fmod)] +binarys += [("hypot", "hypotf", math.hypot)] +binarys += [("remainder", "remainderf", math.remainder)] binarys_fastmath = {} -binarys_fastmath['powf'] = 'fast_powf' +binarys_fastmath["powf"] = "fast_powf" @lower(math.isinf, types.Integer) @@ -179,8 +179,9 @@ def fp16_trunc(x): def impl_boolean(key, ty, libfunc): def lower_boolean_impl(context, builder, sig, args): - libfunc_impl = context.get_function(libfunc, - typing.signature(types.int32, ty)) + libfunc_impl = context.get_function( + libfunc, typing.signature(types.int32, ty) + ) result = libfunc_impl(builder, args) return context.cast(builder, result, types.int32, types.boolean) @@ -197,9 +198,11 @@ def lower_unary_impl(context, builder, sig, args): if fast_replacement is not None: actual_libfunc = getattr(libdevice, fast_replacement) - libfunc_impl = context.get_function(actual_libfunc, - typing.signature(ty, ty)) + libfunc_impl = context.get_function( + actual_libfunc, typing.signature(ty, ty) + ) return libfunc_impl(builder, args) + return lower_unary_impl @@ -208,7 +211,7 @@ def 
get_unary_impl_for_fn_and_ty(fn, ty): # unary implementations, it does not appear in the unarys list. However, # its implementation can be looked up by key like the other # implementations, so we add it to the list we search here. - tanh_impls = ('tanh', 'tanhf', math.tanh) + tanh_impls = ("tanh", "tanhf", math.tanh) for fname64, fname32, key in unarys + [tanh_impls]: if fn == key: if ty == float32: @@ -233,7 +236,7 @@ def lower_unary_int_impl(context, builder, sig, args): elif sig.args[0] == uint64: convert = builder.uitofp else: - m = 'Only 64-bit integers are supported for generic unary int ops' + m = "Only 64-bit integers are supported for generic unary int ops" raise TypeError(m) arg = convert(args[0], ir.DoubleType()) @@ -254,9 +257,11 @@ def lower_binary_impl(context, builder, sig, args): if fast_replacement is not None: actual_libfunc = getattr(libdevice, fast_replacement) - libfunc_impl = context.get_function(actual_libfunc, - typing.signature(ty, ty, ty)) + libfunc_impl = context.get_function( + actual_libfunc, typing.signature(ty, ty, ty) + ) return libfunc_impl(builder, args) + return lower_binary_impl @@ -285,7 +290,7 @@ def lower_binary_int_impl(context, builder, sig, args): elif sig.args[0] == uint64: convert = builder.uitofp else: - m = 'Only 64-bit integers are supported for generic binary int ops' + m = "Only 64-bit integers are supported for generic binary int ops" raise TypeError(m) args = [convert(arg, ir.DoubleType()) for arg in args] @@ -390,12 +395,12 @@ def tanh_impl_libdevice(): def tanhf_impl_fastmath(): fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()]) - asm = ir.InlineAsm(fnty, 'tanh.approx.f32 $0, $1;', '=f,f') + asm = ir.InlineAsm(fnty, "tanh.approx.f32 $0, $1;", "=f,f") return builder.call(asm, args) if ty == float32 and context.fastmath: cc = get_compute_capability() - if cc >= (7,5): + if cc >= (7, 5): return tanhf_impl_fastmath() return tanh_impl_libdevice() @@ -420,7 +425,6 @@ def tanhf_impl_fastmath(): def 
cpow_implement(fty, cty): def core(context, builder, sig, args): def cpow_internal(a, b): - if b.real == fty(0.0) and b.imag == fty(0.0): return cty(1.0) + cty(0.0j) elif a.real == fty(0.0) and b.real == fty(0.0): @@ -434,8 +438,9 @@ def cpow_internal(a, b): len /= math.exp(at * b.imag) phase += b.imag * math.log(vabs) - return len * (cty(math.cos(phase)) + - cty(math.sin(phase) * cty(1.0j))) + return len * ( + cty(math.cos(phase)) + cty(math.sin(phase) * cty(1.0j)) + ) return context.compile_internal(builder, cpow_internal, sig, args) diff --git a/numba_cuda/numba/cuda/models.py b/numba_cuda/numba/cuda/models.py index 21d115125..f9735d7fc 100644 --- a/numba_cuda/numba/cuda/models.py +++ b/numba_cuda/numba/cuda/models.py @@ -16,11 +16,7 @@ @register_model(Dim3) class Dim3Model(models.StructModel): def __init__(self, dmm, fe_type): - members = [ - ('x', types.int32), - ('y', types.int32), - ('z', types.int32) - ] + members = [("x", types.int32), ("y", types.int32), ("z", types.int32)] super().__init__(dmm, fe_type, members) diff --git a/numba_cuda/numba/cuda/nvvmutils.py b/numba_cuda/numba/cuda/nvvmutils.py index 9a7dcde02..1b4fa1c33 100644 --- a/numba_cuda/numba/cuda/nvvmutils.py +++ b/numba_cuda/numba/cuda/nvvmutils.py @@ -5,159 +5,178 @@ def declare_atomic_cas_int(lmod, isize): - fname = '___numba_atomic_i' + str(isize) + '_cas_hack' - fnty = ir.FunctionType(ir.IntType(isize), - (ir.PointerType(ir.IntType(isize)), - ir.IntType(isize), - ir.IntType(isize))) + fname = "___numba_atomic_i" + str(isize) + "_cas_hack" + fnty = ir.FunctionType( + ir.IntType(isize), + ( + ir.PointerType(ir.IntType(isize)), + ir.IntType(isize), + ir.IntType(isize), + ), + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def atomic_cmpxchg(builder, lmod, isize, ptr, cmp, val): - out = builder.cmpxchg(ptr, cmp, val, 'monotonic', 'monotonic') + out = builder.cmpxchg(ptr, cmp, val, "monotonic", "monotonic") return builder.extract_value(out, 0) def declare_atomic_add_float32(lmod): - 
fname = 'llvm.nvvm.atomic.load.add.f32.p0f32' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType(), 0), ir.FloatType())) + fname = "llvm.nvvm.atomic.load.add.f32.p0f32" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType(), 0), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_add_float64(lmod): flags = targetconfig.ConfigStack().top() if flags.compute_capability >= (6, 0): - fname = 'llvm.nvvm.atomic.load.add.f64.p0f64' + fname = "llvm.nvvm.atomic.load.add.f64.p0f64" else: - fname = '___numba_atomic_double_add' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_add" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_sub_float32(lmod): - fname = '___numba_atomic_float_sub' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_sub" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_sub_float64(lmod): - fname = '___numba_atomic_double_sub' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_sub" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_inc_int32(lmod): - fname = 'llvm.nvvm.atomic.load.inc.32.p0i32' - fnty = ir.FunctionType(ir.IntType(32), - (ir.PointerType(ir.IntType(32)), ir.IntType(32))) + fname = "llvm.nvvm.atomic.load.inc.32.p0i32" + fnty = ir.FunctionType( + ir.IntType(32), (ir.PointerType(ir.IntType(32)), ir.IntType(32)) + ) return cgutils.get_or_insert_function(lmod, fnty, 
fname) def declare_atomic_inc_int64(lmod): - fname = '___numba_atomic_u64_inc' - fnty = ir.FunctionType(ir.IntType(64), - (ir.PointerType(ir.IntType(64)), ir.IntType(64))) + fname = "___numba_atomic_u64_inc" + fnty = ir.FunctionType( + ir.IntType(64), (ir.PointerType(ir.IntType(64)), ir.IntType(64)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_dec_int32(lmod): - fname = 'llvm.nvvm.atomic.load.dec.32.p0i32' - fnty = ir.FunctionType(ir.IntType(32), - (ir.PointerType(ir.IntType(32)), ir.IntType(32))) + fname = "llvm.nvvm.atomic.load.dec.32.p0i32" + fnty = ir.FunctionType( + ir.IntType(32), (ir.PointerType(ir.IntType(32)), ir.IntType(32)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_dec_int64(lmod): - fname = '___numba_atomic_u64_dec' - fnty = ir.FunctionType(ir.IntType(64), - (ir.PointerType(ir.IntType(64)), ir.IntType(64))) + fname = "___numba_atomic_u64_dec" + fnty = ir.FunctionType( + ir.IntType(64), (ir.PointerType(ir.IntType(64)), ir.IntType(64)) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_max_float32(lmod): - fname = '___numba_atomic_float_max' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_max" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_max_float64(lmod): - fname = '___numba_atomic_double_max' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_max" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_min_float32(lmod): - fname = '___numba_atomic_float_min' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + 
fname = "___numba_atomic_float_min" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_min_float64(lmod): - fname = '___numba_atomic_double_min' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_min" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmax_float32(lmod): - fname = '___numba_atomic_float_nanmax' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_nanmax" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmax_float64(lmod): - fname = '___numba_atomic_double_nanmax' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_nanmax" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmin_float32(lmod): - fname = '___numba_atomic_float_nanmin' - fnty = ir.FunctionType(ir.FloatType(), - (ir.PointerType(ir.FloatType()), ir.FloatType())) + fname = "___numba_atomic_float_nanmin" + fnty = ir.FunctionType( + ir.FloatType(), (ir.PointerType(ir.FloatType()), ir.FloatType()) + ) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_atomic_nanmin_float64(lmod): - fname = '___numba_atomic_double_nanmin' - fnty = ir.FunctionType(ir.DoubleType(), - (ir.PointerType(ir.DoubleType()), ir.DoubleType())) + fname = "___numba_atomic_double_nanmin" + fnty = ir.FunctionType( + ir.DoubleType(), (ir.PointerType(ir.DoubleType()), ir.DoubleType()) + ) 
return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_cudaCGGetIntrinsicHandle(lmod): - fname = 'cudaCGGetIntrinsicHandle' - fnty = ir.FunctionType(ir.IntType(64), - (ir.IntType(32),)) + fname = "cudaCGGetIntrinsicHandle" + fnty = ir.FunctionType(ir.IntType(64), (ir.IntType(32),)) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_cudaCGSynchronize(lmod): - fname = 'cudaCGSynchronize' - fnty = ir.FunctionType(ir.IntType(32), - (ir.IntType(64), ir.IntType(32))) + fname = "cudaCGSynchronize" + fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(64), ir.IntType(32))) return cgutils.get_or_insert_function(lmod, fnty, fname) def declare_string(builder, value): lmod = builder.basic_block.function.module cval = cgutils.make_bytearray(value.encode("utf-8") + b"\x00") - gl = cgutils.add_global_variable(lmod, cval.type, name="_str", - addrspace=nvvm.ADDRSPACE_CONSTANT) - gl.linkage = 'internal' + gl = cgutils.add_global_variable( + lmod, cval.type, name="_str", addrspace=nvvm.ADDRSPACE_CONSTANT + ) + gl.linkage = "internal" gl.global_constant = True gl.initializer = cval - return builder.addrspacecast(gl, ir.PointerType(ir.IntType(8)), 'generic') + return builder.addrspacecast(gl, ir.PointerType(ir.IntType(8)), "generic") def declare_vprint(lmod): @@ -172,24 +191,20 @@ def declare_vprint(lmod): # ----------------------------------------------------------------------------- SREG_MAPPING = { - 'tid.x': 'llvm.nvvm.read.ptx.sreg.tid.x', - 'tid.y': 'llvm.nvvm.read.ptx.sreg.tid.y', - 'tid.z': 'llvm.nvvm.read.ptx.sreg.tid.z', - - 'ntid.x': 'llvm.nvvm.read.ptx.sreg.ntid.x', - 'ntid.y': 'llvm.nvvm.read.ptx.sreg.ntid.y', - 'ntid.z': 'llvm.nvvm.read.ptx.sreg.ntid.z', - - 'ctaid.x': 'llvm.nvvm.read.ptx.sreg.ctaid.x', - 'ctaid.y': 'llvm.nvvm.read.ptx.sreg.ctaid.y', - 'ctaid.z': 'llvm.nvvm.read.ptx.sreg.ctaid.z', - - 'nctaid.x': 'llvm.nvvm.read.ptx.sreg.nctaid.x', - 'nctaid.y': 'llvm.nvvm.read.ptx.sreg.nctaid.y', - 'nctaid.z': 
'llvm.nvvm.read.ptx.sreg.nctaid.z', - - 'warpsize': 'llvm.nvvm.read.ptx.sreg.warpsize', - 'laneid': 'llvm.nvvm.read.ptx.sreg.laneid', + "tid.x": "llvm.nvvm.read.ptx.sreg.tid.x", + "tid.y": "llvm.nvvm.read.ptx.sreg.tid.y", + "tid.z": "llvm.nvvm.read.ptx.sreg.tid.z", + "ntid.x": "llvm.nvvm.read.ptx.sreg.ntid.x", + "ntid.y": "llvm.nvvm.read.ptx.sreg.ntid.y", + "ntid.z": "llvm.nvvm.read.ptx.sreg.ntid.z", + "ctaid.x": "llvm.nvvm.read.ptx.sreg.ctaid.x", + "ctaid.y": "llvm.nvvm.read.ptx.sreg.ctaid.y", + "ctaid.z": "llvm.nvvm.read.ptx.sreg.ctaid.z", + "nctaid.x": "llvm.nvvm.read.ptx.sreg.nctaid.x", + "nctaid.y": "llvm.nvvm.read.ptx.sreg.nctaid.y", + "nctaid.z": "llvm.nvvm.read.ptx.sreg.nctaid.z", + "warpsize": "llvm.nvvm.read.ptx.sreg.warpsize", + "laneid": "llvm.nvvm.read.ptx.sreg.laneid", } @@ -205,16 +220,16 @@ def __init__(self, builder): self.builder = builder def tid(self, xyz): - return call_sreg(self.builder, 'tid.%s' % xyz) + return call_sreg(self.builder, "tid.%s" % xyz) def ctaid(self, xyz): - return call_sreg(self.builder, 'ctaid.%s' % xyz) + return call_sreg(self.builder, "ctaid.%s" % xyz) def ntid(self, xyz): - return call_sreg(self.builder, 'ntid.%s' % xyz) + return call_sreg(self.builder, "ntid.%s" % xyz) def nctaid(self, xyz): - return call_sreg(self.builder, 'nctaid.%s' % xyz) + return call_sreg(self.builder, "nctaid.%s" % xyz) def getdim(self, xyz): i64 = ir.IntType(64) @@ -227,7 +242,7 @@ def getdim(self, xyz): def get_global_id(builder, dim): sreg = SRegBuilder(builder) - it = (sreg.getdim(xyz) for xyz in 'xyz') + it = (sreg.getdim(xyz) for xyz in "xyz") seq = list(itertools.islice(it, None, dim)) if dim == 1: return seq[0] diff --git a/numba_cuda/numba/cuda/printimpl.py b/numba_cuda/numba/cuda/printimpl.py index b8f3d2eec..6acd70049 100644 --- a/numba_cuda/numba/cuda/printimpl.py +++ b/numba_cuda/numba/cuda/printimpl.py @@ -15,6 +15,7 @@ # NOTE: we don't use @lower here since print_item() doesn't return a LLVM value + @singledispatch def 
print_item(ty, context, builder, val): """ @@ -22,8 +23,9 @@ def print_item(ty, context, builder, val): A (format string, [list of arguments]) is returned that will allow forming the final printf()-like call. """ - raise NotImplementedError("printing unimplemented for values of type %s" - % (ty,)) + raise NotImplementedError( + "printing unimplemented for values of type %s" % (ty,) + ) @print_item.register(types.Integer) @@ -92,11 +94,13 @@ def print_varargs(context, builder, sig, args): rawfmt = " ".join(formats) + "\n" if len(args) > 32: - msg = ('CUDA print() cannot print more than 32 items. ' - 'The raw format string will be emitted by the kernel instead.') + msg = ( + "CUDA print() cannot print more than 32 items. " + "The raw format string will be emitted by the kernel instead." + ) warn(msg, NumbaWarning) - rawfmt = rawfmt.replace('%', '%%') + rawfmt = rawfmt.replace("%", "%%") fmt = context.insert_string_const_addrspace(builder, rawfmt) array = cgutils.make_anonymous_struct(builder, values) arrayptr = cgutils.alloca_once_value(builder, array) diff --git a/numba_cuda/numba/cuda/random.py b/numba_cuda/numba/cuda/random.py index 460c7fc21..82905e8ac 100644 --- a/numba_cuda/numba/cuda/random.py +++ b/numba_cuda/numba/cuda/random.py @@ -1,7 +1,16 @@ import math -from numba import (config, cuda, float32, float64, uint32, int64, uint64, - from_dtype, jit) +from numba import ( + config, + cuda, + float32, + float64, + uint32, + int64, + uint64, + from_dtype, + jit, +) import numpy as np @@ -29,8 +38,9 @@ # using the CPU @jit decorator everywhere to create functions that work as # both CPU and CUDA device functions. 
-xoroshiro128p_dtype = np.dtype([('s0', np.uint64), ('s1', np.uint64)], - align=True) +xoroshiro128p_dtype = np.dtype( + [("s0", np.uint64), ("s1", np.uint64)], align=True +) xoroshiro128p_type = from_dtype(xoroshiro128p_dtype) # When cudasim is enabled, Fake CUDA arrays are passed to some of the @@ -45,7 +55,7 @@ @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def init_xoroshiro128p_state(states, index, seed): - '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed. + """Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed. This ensures that manually set small seeds don't result in a predictable initial sequence from the random number generator. @@ -56,7 +66,7 @@ def init_xoroshiro128p_state(states, index, seed): :param index: offset in states to update :type seed: int64 :param seed: seed value to use when initializing state - ''' + """ index = int64(index) seed = uint64(seed) @@ -65,13 +75,13 @@ def init_xoroshiro128p_state(states, index, seed): z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB) z = z ^ (z >> uint32(31)) - states[index]['s0'] = z - states[index]['s1'] = z + states[index]["s0"] = z + states[index]["s1"] = z @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def rotl(x, k): - '''Left rotate x by k bits.''' + """Left rotate x by k bits.""" x = uint64(x) k = uint32(k) return (x << k) | (x >> uint32(64 - k)) @@ -79,38 +89,38 @@ def rotl(x, k): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_next(states, index): - '''Return the next random uint64 and advance the RNG in states[index]. + """Return the next random uint64 and advance the RNG in states[index]. 
:type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: uint64 - ''' + """ index = int64(index) - s0 = states[index]['s0'] - s1 = states[index]['s1'] + s0 = states[index]["s0"] + s1 = states[index]["s1"] result = s0 + s1 s1 ^= s0 - states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14)) - states[index]['s1'] = uint64(rotl(s1, uint32(36))) + states[index]["s0"] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14)) + states[index]["s1"] = uint64(rotl(s1, uint32(36))) return result @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_jump(states, index): - '''Advance the RNG in ``states[index]`` by 2**64 steps. + """Advance the RNG in ``states[index]`` by 2**64 steps. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update - ''' + """ index = int64(index) - jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922)) + jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922)) s0 = uint64(0) s1 = uint64(0) @@ -118,52 +128,52 @@ def xoroshiro128p_jump(states, index): for i in range(2): for b in range(64): if jump[i] & (uint64(1) << uint32(b)): - s0 ^= states[index]['s0'] - s1 ^= states[index]['s1'] + s0 ^= states[index]["s0"] + s1 ^= states[index]["s1"] xoroshiro128p_next(states, index) - states[index]['s0'] = s0 - states[index]['s1'] = s1 + states[index]["s0"] = s0 + states[index]["s1"] = s1 @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def uint64_to_unit_float64(x): - '''Convert uint64 to float64 value in the range [0.0, 1.0)''' + """Convert uint64 to float64 value in the range [0.0, 1.0)""" x = uint64(x) return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53))) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def uint64_to_unit_float32(x): - '''Convert uint64 to float32 value in 
the range [0.0, 1.0)''' + """Convert uint64 to float32 value in the range [0.0, 1.0)""" x = uint64(x) return float32(uint64_to_unit_float64(x)) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_uniform_float32(states, index): - '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``. + """Return a float32 in range [0.0, 1.0) and advance ``states[index]``. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: float32 - ''' + """ index = int64(index) return uint64_to_unit_float32(xoroshiro128p_next(states, index)) @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_uniform_float64(states, index): - '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``. + """Return a float64 in range [0.0, 1.0) and advance ``states[index]``. :type states: 1D array, dtype=xoroshiro128p_dtype :param states: array of RNG states :type index: int64 :param index: offset in states to update :rtype: float64 - ''' + """ index = int64(index) return uint64_to_unit_float64(xoroshiro128p_next(states, index)) @@ -174,7 +184,7 @@ def xoroshiro128p_uniform_float64(states, index): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_normal_float32(states, index): - '''Return a normally distributed float32 and advance ``states[index]``. + """Return a normally distributed float32 and advance ``states[index]``. The return value is drawn from a Gaussian of mean=0 and sigma=1 using the Box-Muller transform. This advances the RNG sequence by two steps. 
@@ -184,7 +194,7 @@ def xoroshiro128p_normal_float32(states, index): :type index: int64 :param index: offset in states to update :rtype: float32 - ''' + """ index = int64(index) u1 = xoroshiro128p_uniform_float32(states, index) @@ -199,7 +209,7 @@ def xoroshiro128p_normal_float32(states, index): @jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython) def xoroshiro128p_normal_float64(states, index): - '''Return a normally distributed float32 and advance ``states[index]``. + """Return a normally distributed float32 and advance ``states[index]``. The return value is drawn from a Gaussian of mean=0 and sigma=1 using the Box-Muller transform. This advances the RNG sequence by two steps. @@ -209,7 +219,7 @@ def xoroshiro128p_normal_float64(states, index): :type index: int64 :param index: offset in states to update :rtype: float64 - ''' + """ index = int64(index) u1 = xoroshiro128p_uniform_float32(states, index) @@ -242,7 +252,7 @@ def init_xoroshiro128p_states_cpu(states, seed, subsequence_start): def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): - '''Initialize RNG states on the GPU for parallel generators. + """Initialize RNG states on the GPU for parallel generators. This initializes the RNG states so that each state in the array corresponds subsequences in the separated by 2**64 steps from each other in the main @@ -257,7 +267,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): :param states: array of RNG states :type seed: uint64 :param seed: starting seed for list of generators - ''' + """ # Initialization on CPU is much faster than the GPU states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype) @@ -267,7 +277,7 @@ def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0): def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0): - '''Returns a new device array initialized for n random number generators. 
+ """Returns a new device array initialized for n random number generators. This initializes the RNG states so that each state in the array corresponds subsequences in the separated by 2**64 steps from each other in the main @@ -286,7 +296,7 @@ def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0): :param subsequence_start: :type stream: CUDA stream :param stream: stream to run initialization kernel on - ''' + """ states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream) init_xoroshiro128p_states(states, seed, subsequence_start, stream) return states diff --git a/numba_cuda/numba/cuda/reshape_funcs.cu b/numba_cuda/numba/cuda/reshape_funcs.cu index 123bfed97..7dfc19db7 100644 --- a/numba_cuda/numba/cuda/reshape_funcs.cu +++ b/numba_cuda/numba/cuda/reshape_funcs.cu @@ -148,4 +148,4 @@ numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp * } return 1; -} \ No newline at end of file +} diff --git a/numba_cuda/numba/cuda/runtime/__init__.py b/numba_cuda/numba/cuda/runtime/__init__.py index 636f187d3..881d0d0d0 100644 --- a/numba_cuda/numba/cuda/runtime/__init__.py +++ b/numba_cuda/numba/cuda/runtime/__init__.py @@ -1 +1 @@ -from numba.cuda.runtime.nrt import rtsys # noqa: F401 +from numba.cuda.runtime.nrt import rtsys # noqa: F401 diff --git a/numba_cuda/numba/cuda/runtime/memsys.cu b/numba_cuda/numba/cuda/runtime/memsys.cu index a5820971c..ed1133dbd 100644 --- a/numba_cuda/numba/cuda/runtime/memsys.cu +++ b/numba_cuda/numba/cuda/runtime/memsys.cu @@ -91,4 +91,4 @@ extern "C" __global__ void NRT_MemSys_print(void) } else { printf("TheMsys is null.\n"); } -} \ No newline at end of file +} diff --git a/numba_cuda/numba/cuda/runtime/memsys.cuh b/numba_cuda/numba/cuda/runtime/memsys.cuh index 862a1754c..74cfefc49 100644 --- a/numba_cuda/numba/cuda/runtime/memsys.cuh +++ b/numba_cuda/numba/cuda/runtime/memsys.cuh @@ -14,4 +14,4 @@ struct NRT_MemSys { /* The Memory System object */ __device__ NRT_MemSys* TheMSys; 
-extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr); \ No newline at end of file +extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr); diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu index 879bf8d2f..a318dd4dd 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.cu +++ b/numba_cuda/numba/cuda/runtime/nrt.cu @@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size) { void* ptr = NULL; ptr = malloc(size); - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); } return ptr; } @@ -49,7 +49,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi, mi->dtor_info = dtor_info; mi->data = data; mi->size = size; - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); } } @@ -77,7 +77,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi) extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) { NRT_dealloc(mi); - if (TheMSys && TheMSys->stats.enabled) { + if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); } } diff --git a/numba_cuda/numba/cuda/runtime/nrt.py b/numba_cuda/numba/cuda/runtime/nrt.py index 0b6781789..8b3be0c8e 100644 --- a/numba_cuda/numba/cuda/runtime/nrt.py +++ b/numba_cuda/numba/cuda/runtime/nrt.py @@ -5,26 +5,28 @@ from numba import cuda, config from numba.core.runtime.nrt import _nrt_mstats -from numba.cuda.cudadrv.driver import (Linker, driver, launch_kernel, - USE_NV_BINDING) +from numba.cuda.cudadrv.driver import ( + Linker, + driver, + launch_kernel, + USE_NV_BINDING, +) from numba.cuda.cudadrv import devices from numba.cuda.api import get_current_device from numba.cuda.utils import _readenv # Check environment variable or config for NRT statistics enablement -NRT_STATS = ( - _readenv("NUMBA_CUDA_NRT_STATS", bool, 
False) or - getattr(config, "NUMBA_CUDA_NRT_STATS", False) +NRT_STATS = _readenv("NUMBA_CUDA_NRT_STATS", bool, False) or getattr( + config, "NUMBA_CUDA_NRT_STATS", False ) if not hasattr(config, "NUMBA_CUDA_NRT_STATS"): config.CUDA_NRT_STATS = NRT_STATS # Check environment variable or config for NRT enablement -ENABLE_NRT = ( - _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or - getattr(config, "NUMBA_CUDA_ENABLE_NRT", False) +ENABLE_NRT = _readenv("NUMBA_CUDA_ENABLE_NRT", bool, False) or getattr( + config, "NUMBA_CUDA_ENABLE_NRT", False ) if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"): config.CUDA_ENABLE_NRT = ENABLE_NRT @@ -35,16 +37,19 @@ def _alloc_init_guard(method): """ Ensure NRT memory allocation and initialization before running the method """ + @wraps(method) def wrapper(self, *args, **kwargs): self.ensure_allocated() self.ensure_initialized() return method(self, *args, **kwargs) + return wrapper class _Runtime: """Singleton class for Numba CUDA runtime""" + _instance = None def __new__(cls, *args, **kwargs): @@ -64,8 +69,7 @@ def _compile_memsys_module(self): """ # Define the path for memsys.cu memsys_mod = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "memsys.cu" + os.path.dirname(os.path.abspath(__file__)), "memsys.cu" ) cc = get_current_device().compute_capability @@ -105,10 +109,12 @@ def allocate(self, stream=None): # Allocate space for NRT_MemSys ptr, nbytes = self._memsys_module.get_global_symbol("memsys_size") memsys_size = ctypes.c_uint64() - driver.cuMemcpyDtoH(ctypes.addressof(memsys_size), - ptr.device_ctypes_pointer, nbytes) + driver.cuMemcpyDtoH( + ctypes.addressof(memsys_size), ptr.device_ctypes_pointer, nbytes + ) self._memsys = device_array( - (memsys_size.value,), dtype="i1", stream=stream) + (memsys_size.value,), dtype="i1", stream=stream + ) self.set_memsys_to_module(self._memsys_module, stream=stream) def _single_thread_launch(self, module, stream, name, params=()): @@ -121,12 +127,16 @@ def 
_single_thread_launch(self, module, stream, name, params=()): func = module.get_function(name) launch_kernel( func.handle, - 1, 1, 1, - 1, 1, 1, + 1, + 1, + 1, + 1, + 1, + 1, 0, stream.handle, params, - cooperative=False + cooperative=False, ) def _ctypes_pointer(self, array): @@ -158,7 +168,8 @@ def initialize(self, stream=None): self.ensure_allocated() self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_init") + self._memsys_module, stream, "NRT_MemSys_init" + ) self._initialized = True if config.CUDA_NRT_STATS: @@ -170,7 +181,8 @@ def memsys_enable_stats(self, stream=None): Enable memsys statistics """ self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_enable_stats") + self._memsys_module, stream, "NRT_MemSys_enable_stats" + ) @_alloc_init_guard def memsys_disable_stats(self, stream=None): @@ -178,7 +190,8 @@ def memsys_disable_stats(self, stream=None): Disable memsys statistics """ self._single_thread_launch( - self._memsys_module, stream, "NRT_MemSys_disable_stats") + self._memsys_module, stream, "NRT_MemSys_disable_stats" + ) @_alloc_init_guard def memsys_stats_enabled(self, stream=None): @@ -193,7 +206,7 @@ def memsys_stats_enabled(self, stream=None): self._memsys_module, stream, "NRT_MemSys_stats_enabled", - (enabled_ptr,) + (enabled_ptr,), ) cuda.synchronize() @@ -204,21 +217,20 @@ def _copy_memsys_to_host(self, stream): """ Copy all statistics of memsys to the host """ - dt = np.dtype([ - ('alloc', np.uint64), - ('free', np.uint64), - ('mi_alloc', np.uint64), - ('mi_free', np.uint64) - ]) + dt = np.dtype( + [ + ("alloc", np.uint64), + ("free", np.uint64), + ("mi_alloc", np.uint64), + ("mi_free", np.uint64), + ] + ) stats_for_read = cuda.managed_array(1, dt) stats_ptr = self._ctypes_pointer(stats_for_read) self._single_thread_launch( - self._memsys_module, - stream, - "NRT_MemSys_read", - [stats_ptr] + self._memsys_module, stream, "NRT_MemSys_read", [stats_ptr] ) cuda.synchronize() @@ -237,7 +249,7 @@ def 
get_allocation_stats(self, stream=None): alloc=memsys["alloc"], free=memsys["free"], mi_alloc=memsys["mi_alloc"], - mi_free=memsys["mi_free"] + mi_free=memsys["mi_free"], ) @_alloc_init_guard @@ -249,10 +261,7 @@ def _get_single_stat(self, stat, stream=None): got_ptr = self._ctypes_pointer(got) self._single_thread_launch( - self._memsys_module, - stream, - f"NRT_MemSys_read_{stat}", - [got_ptr] + self._memsys_module, stream, f"NRT_MemSys_read_{stat}", [got_ptr] ) cuda.synchronize() @@ -309,15 +318,13 @@ def set_memsys_to_module(self, module, stream=None): """ if self._memsys is None: raise RuntimeError( - "Please allocate NRT Memsys first before setting to module.") + "Please allocate NRT Memsys first before setting to module." + ) memsys_ptr = self._ctypes_pointer(self._memsys) self._single_thread_launch( - module, - stream, - "NRT_MemSys_set", - [memsys_ptr] + module, stream, "NRT_MemSys_set", [memsys_ptr] ) @_alloc_init_guard @@ -327,9 +334,7 @@ def print_memsys(self, stream=None): """ cuda.synchronize() self._single_thread_launch( - self._memsys_module, - stream, - "NRT_MemSys_print" + self._memsys_module, stream, "NRT_MemSys_print" ) diff --git a/numba_cuda/numba/cuda/simulator/__init__.py b/numba_cuda/numba/cuda/simulator/__init__.py index d24aa6e7d..ad75c1ec7 100644 --- a/numba_cuda/numba/cuda/simulator/__init__.py +++ b/numba_cuda/numba/cuda/simulator/__init__.py @@ -3,14 +3,22 @@ from .api import * from .vector_types import vector_types from .reduction import Reduce -from .cudadrv.devicearray import (device_array, device_array_like, pinned, - pinned_array, pinned_array_like, - mapped_array, to_device, auto_device) +from .cudadrv.devicearray import ( + device_array, + device_array_like, + pinned, + pinned_array, + pinned_array_like, + mapped_array, + to_device, + auto_device, +) from .cudadrv import devicearray from .cudadrv.devices import require_context, gpus from .cudadrv.devices import get_context as current_context from .cudadrv.runtime import runtime 
from numba.core import config + reduce = Reduce # Register simulated vector types as module level variables @@ -25,14 +33,16 @@ if config.ENABLE_CUDASIM: import sys from numba.cuda.simulator import cudadrv - sys.modules['numba.cuda.cudadrv'] = cudadrv - sys.modules['numba.cuda.cudadrv.devicearray'] = cudadrv.devicearray - sys.modules['numba.cuda.cudadrv.devices'] = cudadrv.devices - sys.modules['numba.cuda.cudadrv.driver'] = cudadrv.driver - sys.modules['numba.cuda.cudadrv.runtime'] = cudadrv.runtime - sys.modules['numba.cuda.cudadrv.drvapi'] = cudadrv.drvapi - sys.modules['numba.cuda.cudadrv.error'] = cudadrv.error - sys.modules['numba.cuda.cudadrv.nvvm'] = cudadrv.nvvm + + sys.modules["numba.cuda.cudadrv"] = cudadrv + sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray + sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices + sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver + sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime + sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi + sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error + sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm from . import compiler - sys.modules['numba.cuda.compiler'] = compiler + + sys.modules["numba.cuda.compiler"] = compiler diff --git a/numba_cuda/numba/cuda/simulator/api.py b/numba_cuda/numba/cuda/simulator/api.py index c6a55e88e..39a893d3f 100644 --- a/numba_cuda/numba/cuda/simulator/api.py +++ b/numba_cuda/numba/cuda/simulator/api.py @@ -1,6 +1,6 @@ -''' +""" Contains CUDA API functions -''' +""" # Imports here bring together parts of the API from other modules, so some of # them appear unused. 
@@ -15,7 +15,7 @@ def select_device(dev=0): - assert dev == 0, 'Only a single device supported by the simulator' + assert dev == 0, "Only a single device supported by the simulator" def is_float16_supported(): @@ -23,10 +23,11 @@ def is_float16_supported(): class stream(object): - ''' + """ The stream API is supported in the simulator - however, all execution occurs synchronously, so synchronization requires no operation. - ''' + """ + @contextmanager def auto_synchronize(self): yield @@ -62,9 +63,9 @@ def declare_device(*args, **kwargs): def detect(): - print('Found 1 CUDA devices') - print('id %d %20s %40s' % (0, 'SIMULATOR', '[SUPPORTED]')) - print('%40s: 5.0' % 'compute capability') + print("Found 1 CUDA devices") + print("id %d %20s %40s" % (0, "SIMULATOR", "[SUPPORTED]")) + print("%40s: 5.0" % "compute capability") def list_devices(): @@ -73,11 +74,13 @@ def list_devices(): # Events + class Event(object): - ''' + """ The simulator supports the event API, but they do not record timing info, and all simulation is synchronous. Execution time is not recorded. 
- ''' + """ + def record(self, stream=0): pass @@ -88,35 +91,48 @@ def synchronize(self): pass def elapsed_time(self, event): - warn('Simulator timings are bogus') + warn("Simulator timings are bogus") return 0.0 event = Event -def jit(func_or_sig=None, device=False, debug=None, argtypes=None, - inline=False, restype=None, fastmath=False, link=None, - boundscheck=None, opt=None, cache=None - ): +def jit( + func_or_sig=None, + device=False, + debug=None, + argtypes=None, + inline=False, + restype=None, + fastmath=False, + link=None, + boundscheck=None, + opt=None, + cache=None, +): # Here for API compatibility if boundscheck: raise NotImplementedError("bounds checking is not supported for CUDA") if link is not None: - raise NotImplementedError('Cannot link PTX in the simulator') + raise NotImplementedError("Cannot link PTX in the simulator") debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug # Check for first argument specifying types - in that case the # decorator is not being passed a function - if (func_or_sig is None or is_signature(func_or_sig) - or isinstance(func_or_sig, list)): + if ( + func_or_sig is None + or is_signature(func_or_sig) + or isinstance(func_or_sig, list) + ): + def jitwrapper(fn): - return FakeCUDAKernel(fn, - device=device, - fastmath=fastmath, - debug=debug) + return FakeCUDAKernel( + fn, device=device, fastmath=fastmath, debug=debug + ) + return jitwrapper return FakeCUDAKernel(func_or_sig, device=device, debug=debug) diff --git a/numba_cuda/numba/cuda/simulator/compiler.py b/numba_cuda/numba/cuda/simulator/compiler.py index 7db28d41a..ddebaf51c 100644 --- a/numba_cuda/numba/cuda/simulator/compiler.py +++ b/numba_cuda/numba/cuda/simulator/compiler.py @@ -1,7 +1,7 @@ -''' +""" The compiler is not implemented in the simulator. This module provides a stub to allow tests to import successfully. 
-''' +""" compile = None compile_for_current_device = None diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py b/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py index dde9362d4..128579600 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py @@ -1,2 +1,8 @@ -from numba.cuda.simulator.cudadrv import (devicearray, devices, driver, drvapi, - error, nvvm) +from numba.cuda.simulator.cudadrv import ( + devicearray, + devices, + driver, + drvapi, + error, + nvvm, +) diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py b/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py index 088184fd4..47d7777af 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py @@ -1,7 +1,8 @@ -''' +""" The Device Array API is not implemented in the simulator. This module provides stubs to allow tests to import correctly. -''' +""" + from contextlib import contextmanager from numba.np.numpy_support import numpy_version @@ -12,37 +13,39 @@ from_record_like = None -errmsg_contiguous_buffer = ("Array contains non-contiguous buffer and cannot " - "be transferred as a single memory region. Please " - "ensure contiguous buffer with numpy " - ".ascontiguousarray()") +errmsg_contiguous_buffer = ( + "Array contains non-contiguous buffer and cannot " + "be transferred as a single memory region. Please " + "ensure contiguous buffer with numpy " + ".ascontiguousarray()" +) class FakeShape(tuple): - ''' + """ The FakeShape class is used to provide a shape which does not allow negative indexing, similar to the shape in CUDA Python. 
(Numpy shape arrays allow negative indexing) - ''' + """ def __getitem__(self, k): if isinstance(k, int) and k < 0: - raise IndexError('tuple index out of range') + raise IndexError("tuple index out of range") return super(FakeShape, self).__getitem__(k) class FakeWithinKernelCUDAArray(object): - ''' + """ Created to emulate the behavior of arrays within kernels, where either array.item or array['item'] is valid (that is, give all structured arrays `numpy.recarray`-like semantics). This behaviour does not follow the semantics of Python and NumPy with non-jitted code, and will be deprecated and removed. - ''' + """ def __init__(self, item): assert isinstance(item, FakeCUDAArray) - self.__dict__['_item'] = item + self.__dict__["_item"] = item def __wrap_if_fake(self, item): if isinstance(item, FakeCUDAArray): @@ -84,18 +87,18 @@ def convert_fakes(obj): return obj - out = kwargs.get('out') + out = kwargs.get("out") if out: - kwargs['out'] = tuple(convert_fakes(o) for o in out) + kwargs["out"] = tuple(convert_fakes(o) for o in out) args = tuple(convert_fakes(a) for a in args) return call(*args, **kwargs) class FakeCUDAArray(object): - ''' + """ Implements the interface of a DeviceArray/DeviceRecord, but mostly just wraps a NumPy array. - ''' + """ __cuda_ndarray__ = True # There must be gpu_data attribute @@ -149,13 +152,13 @@ def copy_to_host(self, ary=None, stream=0): return ary def copy_to_device(self, ary, stream=0): - ''' + """ Copy from the provided array into this array. This may be less forgiving than the CUDA Python implementation, which will copy data up to the length of the smallest of the two arrays, whereas this expects the size of the arrays to be equal. 
- ''' + """ sentry_contiguous(self) self_core, ary_core = array_core(self), array_core(ary) if isinstance(ary, FakeCUDAArray): @@ -164,9 +167,10 @@ def copy_to_device(self, ary, stream=0): else: ary_core = np.array( ary_core, - order='C' if self_core.flags['C_CONTIGUOUS'] else 'F', + order="C" if self_core.flags["C_CONTIGUOUS"] else "F", subok=True, - copy=False if numpy_version < (2, 0) else None) + copy=False if numpy_version < (2, 0) else None, + ) check_array_compatibility(self_core, ary_core) np.copyto(self_core._ary, ary_core) @@ -237,7 +241,7 @@ def __mod__(self, other): return FakeCUDAArray(self._ary % other) def __pow__(self, other): - return FakeCUDAArray(self._ary ** other) + return FakeCUDAArray(self._ary**other) def split(self, section, stream=0): return [ @@ -282,30 +286,33 @@ def is_contiguous(ary): def sentry_contiguous(ary): core = array_core(ary) - if not core.flags['C_CONTIGUOUS'] and not core.flags['F_CONTIGUOUS']: + if not core.flags["C_CONTIGUOUS"] and not core.flags["F_CONTIGUOUS"]: raise ValueError(errmsg_contiguous_buffer) def check_array_compatibility(ary1, ary2): ary1sq, ary2sq = ary1.squeeze(), ary2.squeeze() if ary1.dtype != ary2.dtype: - raise TypeError('incompatible dtype: %s vs. %s' % - (ary1.dtype, ary2.dtype)) + raise TypeError( + "incompatible dtype: %s vs. %s" % (ary1.dtype, ary2.dtype) + ) if ary1sq.shape != ary2sq.shape: - raise ValueError('incompatible shape: %s vs. %s' % - (ary1.shape, ary2.shape)) + raise ValueError( + "incompatible shape: %s vs. %s" % (ary1.shape, ary2.shape) + ) if ary1sq.strides != ary2sq.strides: - raise ValueError('incompatible strides: %s vs. %s' % - (ary1.strides, ary2.strides)) + raise ValueError( + "incompatible strides: %s vs. 
%s" % (ary1.strides, ary2.strides) + ) def to_device(ary, stream=0, copy=True, to=None): - ary = np.array(ary, - copy=False if numpy_version < (2, 0) else None, - subok=True) + ary = np.array( + ary, copy=False if numpy_version < (2, 0) else None, subok=True + ) sentry_contiguous(ary) if to is None: - buffer_dtype = np.int64 if ary.dtype.char in 'Mm' else ary.dtype + buffer_dtype = np.int64 if ary.dtype.char in "Mm" else ary.dtype return FakeCUDAArray( np.ndarray( buffer=np.copy(array_core(ary)).view(buffer_dtype), @@ -324,22 +331,22 @@ def pinned(arg): def mapped_array(*args, **kwargs): - for unused_arg in ('portable', 'wc'): + for unused_arg in ("portable", "wc"): if unused_arg in kwargs: kwargs.pop(unused_arg) return device_array(*args, **kwargs) -def pinned_array(shape, dtype=np.float64, strides=None, order='C'): +def pinned_array(shape, dtype=np.float64, strides=None, order="C"): return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order) -def managed_array(shape, dtype=np.float64, strides=None, order='C'): +def managed_array(shape, dtype=np.float64, strides=None, order="C"): return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order) def device_array(*args, **kwargs): - stream = kwargs.pop('stream') if 'stream' in kwargs else 0 + stream = kwargs.pop("stream") if "stream" in kwargs else 0 return FakeCUDAArray(np.ndarray(*args, **kwargs), stream=stream) @@ -350,7 +357,7 @@ def _contiguous_strides_like_array(ary): """ # Don't recompute strides if the default strides will be sufficient to # create a contiguous array. - if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1: + if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1: return None # Otherwise, we need to compute new strides using an algorithm adapted from @@ -360,7 +367,7 @@ def _contiguous_strides_like_array(ary): # Stride permutation. E.g. 
a stride array (4, -2, 12) becomes # [(1, -2), (0, 4), (2, 12)] - strideperm = [ x for x in enumerate(ary.strides) ] + strideperm = [x for x in enumerate(ary.strides)] strideperm.sort(key=lambda x: x[1]) # Compute new strides using permutation @@ -373,24 +380,26 @@ def _contiguous_strides_like_array(ary): def _order_like_array(ary): - if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']: - return 'F' + if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]: + return "F" else: - return 'C' + return "C" def device_array_like(ary, stream=0): strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return device_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) def pinned_array_like(ary): strides = _contiguous_strides_like_array(ary) order = _order_like_array(ary) - return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides, - order=order) + return pinned_array( + shape=ary.shape, dtype=ary.dtype, strides=strides, order=order + ) def auto_device(ary, stream=0, copy=True): @@ -399,15 +408,14 @@ def auto_device(ary, stream=0, copy=True): if not isinstance(ary, np.void): ary = np.array( - ary, - copy=False if numpy_version < (2, 0) else None, - subok=True) + ary, copy=False if numpy_version < (2, 0) else None, subok=True + ) return to_device(ary, stream, copy), True def is_cuda_ndarray(obj): "Check if an object is a CUDA ndarray" - return getattr(obj, '__cuda_ndarray__', False) + return getattr(obj, "__cuda_ndarray__", False) def verify_cuda_ndarray_interface(obj): @@ -418,15 +426,15 @@ def requires_attr(attr, typ): if not hasattr(obj, attr): raise AttributeError(attr) if not isinstance(getattr(obj, attr), typ): - raise AttributeError('%s must be of type %s' % (attr, typ)) + raise AttributeError("%s must be of type %s" % (attr, typ)) - requires_attr('shape', tuple) - requires_attr('strides', tuple) - 
requires_attr('dtype', np.dtype) - requires_attr('size', int) + requires_attr("shape", tuple) + requires_attr("strides", tuple) + requires_attr("dtype", np.dtype) + requires_attr("size", int) def require_cuda_ndarray(obj): "Raises ValueError is is_cuda_ndarray(obj) evaluates False" if not is_cuda_ndarray(obj): - raise ValueError('require an cuda ndarray object') + raise ValueError("require an cuda ndarray object") diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/devices.py b/numba_cuda/numba/cuda/simulator/cudadrv/devices.py index 3237fb2c6..433316262 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/devices.py @@ -8,7 +8,7 @@ class FakeCUDADevice: def __init__(self): - self.uuid = 'GPU-00000000-0000-0000-0000-000000000000' + self.uuid = "GPU-00000000-0000-0000-0000-000000000000" @property def compute_capability(self): @@ -16,10 +16,11 @@ def compute_capability(self): class FakeCUDAContext: - ''' + """ This stub implements functionality only for simulating a single GPU at the moment. - ''' + """ + def __init__(self, device_id): self._device_id = device_id self._device = FakeCUDADevice() @@ -54,7 +55,7 @@ def get_memory_info(self): dependencies, e.g. `psutil` - so return infinite memory to maintain API type compatibility """ - return _MemoryInfo(float('inf'), float('inf')) + return _MemoryInfo(float("inf"), float("inf")) def memalloc(self, sz): """ @@ -62,19 +63,20 @@ def memalloc(self, sz): At present, there is no division between simulated host memory and simulated device memory. """ - return np.ndarray(sz, dtype='u1') + return np.ndarray(sz, dtype="u1") def memhostalloc(self, sz, mapped=False, portable=False, wc=False): - '''Allocates memory on the host''' + """Allocates memory on the host""" return self.memalloc(sz) class FakeDeviceList: - ''' + """ This stub implements a device list containing a single GPU. It also keeps track of the GPU status, i.e. 
whether the context is closed or not, which may have been set by the user calling reset() - ''' + """ + def __init__(self): self.lst = (FakeCUDAContext(0),) self.closed = False @@ -84,7 +86,7 @@ def __getitem__(self, devnum): return self.lst[devnum] def __str__(self): - return ', '.join([str(d) for d in self.lst]) + return ", ".join([str(d) for d in self.lst]) def __iter__(self): return iter(self.lst) @@ -111,7 +113,7 @@ def get_context(devnum=0): def require_context(func): - ''' + """ In the simulator, a context is always "available", so this is a no-op. - ''' + """ return func diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py index 09de5b729..7a567de58 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/driver.py @@ -1,15 +1,15 @@ -''' +""" Most of the driver API is unsupported in the simulator, but some stubs are provided to allow tests to import correctly. -''' +""" def device_memset(dst, val, size, stream=0): - dst.view('u1')[:size].fill(bytes([val])[0]) + dst.view("u1")[:size].fill(bytes([val])[0]) def host_to_device(dst, src, size, stream=0): - dst.view('u1')[:size] = src.view('u1')[:size] + dst.view("u1")[:size] = src.view("u1")[:size] def device_to_host(dst, src, size, stream=0): @@ -55,7 +55,7 @@ class CudaAPIError(RuntimeError): def launch_kernel(*args, **kwargs): - msg = 'Launching kernels directly is not supported in the simulator' + msg = "Launching kernels directly is not supported in the simulator" raise RuntimeError(msg) diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py b/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py index 44c697f37..8229cba8d 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py @@ -1,4 +1,4 @@ -''' +""" drvapi is not implemented in the simulator, but this module exists to allow tests to import correctly. 
-''' +""" diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/libs.py b/numba_cuda/numba/cuda/simulator/cudadrv/libs.py index 347b936c5..3b56434d6 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/libs.py @@ -1,2 +1,2 @@ def check_static_lib(lib): - raise FileNotFoundError('Linking libraries not supported by cudasim') + raise FileNotFoundError("Linking libraries not supported by cudasim") diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py b/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py index 2a011a77a..4fa5561db 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py @@ -1,7 +1,7 @@ -''' +""" NVVM is not supported in the simulator, but stubs are provided to allow tests to import correctly. -''' +""" class NvvmSupportError(ImportError): @@ -10,7 +10,7 @@ class NvvmSupportError(ImportError): class NVVM(object): def __init__(self): - raise NvvmSupportError('NVVM not supported in the simulator') + raise NvvmSupportError("NVVM not supported in the simulator") CompilationUnit = None diff --git a/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py b/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py index 308d19e76..b38abedb6 100644 --- a/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +++ b/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py @@ -1,7 +1,7 @@ -''' +""" The runtime API is unsupported in the simulator, but some stubs are provided to allow tests to import correctly. 
-''' +""" class FakeRuntime(object): @@ -13,7 +13,7 @@ def is_supported_version(self): @property def supported_versions(self): - return (-1, -1), + return ((-1, -1),) runtime = FakeRuntime() diff --git a/numba_cuda/numba/cuda/simulator/kernel.py b/numba_cuda/numba/cuda/simulator/kernel.py index b3ca22599..74d6d0dd3 100644 --- a/numba_cuda/numba/cuda/simulator/kernel.py +++ b/numba_cuda/numba/cuda/simulator/kernel.py @@ -41,9 +41,10 @@ def _get_kernel_context(): class FakeOverload: - ''' + """ Used only to provide the max_cooperative_grid_blocks method - ''' + """ + def max_cooperative_grid_blocks(self, blockdim): # We can only run one block in a cooperative grid because we have no # mechanism for synchronization between different blocks @@ -58,16 +59,16 @@ def __getitem__(self, key): class FakeCUDAKernel(object): - ''' + """ Wraps a @cuda.jit-ed function. - ''' + """ def __init__(self, fn, device, fastmath=False, extensions=[], debug=False): self.fn = fn self._device = device self._fastmath = fastmath self._debug = debug - self.extensions = list(extensions) # defensive copy + self.extensions = list(extensions) # defensive copy # Initial configuration: grid unconfigured, stream 0, no dynamic shared # memory. 
self.grid_dim = None @@ -82,11 +83,13 @@ def __call__(self, *args): return self.fn(*args) # Ensure we've been given a valid grid configuration - grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim, - self.block_dim) + grid_dim, block_dim = normalize_kernel_dimensions( + self.grid_dim, self.block_dim + ) - fake_cuda_module = FakeCUDAModule(grid_dim, block_dim, - self.dynshared_size) + fake_cuda_module = FakeCUDAModule( + grid_dim, block_dim, self.dynshared_size + ) with _push_kernel_context(fake_cuda_module): # fake_args substitutes all numpy arrays for FakeCUDAArrays # because they implement some semantics differently @@ -96,11 +99,10 @@ def fake_arg(arg): # map the arguments using any extension you've registered _, arg = functools.reduce( lambda ty_val, extension: extension.prepare_args( - *ty_val, - stream=0, - retr=retr), + *ty_val, stream=0, retr=retr + ), self.extensions, - (None, arg) + (None, arg), ) if isinstance(arg, np.ndarray) and arg.ndim > 0: @@ -126,8 +128,9 @@ def fake_arg(arg): wb() def __getitem__(self, configuration): - self.grid_dim, self.block_dim = \ - normalize_kernel_dimensions(*configuration[:2]) + self.grid_dim, self.block_dim = normalize_kernel_dimensions( + *configuration[:2] + ) if len(configuration) == 4: self.dynshared_size = configuration[3] @@ -142,8 +145,9 @@ def specialize(self, *args): def forall(self, ntasks, tpb=0, stream=0, sharedmem=0): if ntasks < 0: - raise ValueError("Can't create ForAll with negative task count: %s" - % ntasks) + raise ValueError( + "Can't create ForAll with negative task count: %s" % ntasks + ) return self[ntasks, 1, stream, sharedmem] @property @@ -157,15 +161,19 @@ def py_func(self): # Thread emulation + class BlockThread(threading.Thread): - ''' + """ Manages the execution of a function for a single CUDA thread. 
- ''' + """ + def __init__(self, f, manager, blockIdx, threadIdx, debug): if debug: + def debug_wrapper(*args, **kwargs): - np.seterr(divide='raise') + np.seterr(divide="raise") f(*args, **kwargs) + target = debug_wrapper else: target = f @@ -181,27 +189,26 @@ def debug_wrapper(*args, **kwargs): self.abort = False self.debug = debug blockDim = Dim3(*self._manager._block_dim) - self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y + - blockDim.y * - self.threadIdx.z)) + self.thread_id = self.threadIdx.x + ( + blockDim.x * (self.threadIdx.y + blockDim.y * self.threadIdx.z) + ) def run(self): try: super(BlockThread, self).run() except Exception as e: - tid = 'tid=%s' % list(self.threadIdx) - ctaid = 'ctaid=%s' % list(self.blockIdx) - if str(e) == '': - msg = '%s %s' % (tid, ctaid) + tid = "tid=%s" % list(self.threadIdx) + ctaid = "ctaid=%s" % list(self.blockIdx) + if str(e) == "": + msg = "%s %s" % (tid, ctaid) else: - msg = '%s %s: %s' % (tid, ctaid, e) + msg = "%s %s: %s" % (tid, ctaid, e) tb = sys.exc_info()[2] # Using `with_traceback` here would cause it to be mutated by # future raise statements, which may or may not matter. self.exception = (type(e)(msg), tb) def syncthreads(self): - if self.abort: raise RuntimeError("abort flag set on syncthreads call") @@ -237,11 +244,11 @@ def syncthreads_or(self, value): return 1 if test else 0 def __str__(self): - return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx) + return "Thread <<<%s, %s>>>" % (self.blockIdx, self.threadIdx) class BlockManager(object): - ''' + """ Manages the execution of a thread block. When run() is called, all threads are started. Each thread executes until it @@ -257,7 +264,8 @@ class BlockManager(object): The polling continues until no threads are alive, when execution is complete. 
- ''' + """ + def __init__(self, f, grid_dim, block_dim, debug): self._grid_dim = grid_dim self._block_dim = block_dim @@ -271,8 +279,10 @@ def run(self, grid_point, *args): livethreads = set() blockedthreads = set() for block_point in np.ndindex(*self._block_dim): + def target(): self._f(*args) + t = BlockThread(target, self, grid_point, block_point, self._debug) t.start() threads.add(t) @@ -286,7 +296,6 @@ def target(): if t.syncthreads_blocked: blockedthreads.add(t) elif t.exception: - # Abort all other simulator threads on exception, # do *not* join immediately to facilitate debugging. for t_other in threads: @@ -300,7 +309,7 @@ def target(): t.syncthreads_blocked = False t.syncthreads_event.set() blockedthreads = set() - livethreads = set([ t for t in livethreads if t.is_alive() ]) + livethreads = set([t for t in livethreads if t.is_alive()]) # Final check for exceptions in case any were set prior to thread # finishing, before we could check it for t in threads: diff --git a/numba_cuda/numba/cuda/simulator/kernelapi.py b/numba_cuda/numba/cuda/simulator/kernelapi.py index 64793df05..49670ab3b 100644 --- a/numba_cuda/numba/cuda/simulator/kernelapi.py +++ b/numba_cuda/numba/cuda/simulator/kernelapi.py @@ -1,7 +1,7 @@ -''' +""" Implements the cuda module as called from within an executing kernel (@cuda.jit-decorated function). -''' +""" from contextlib import contextmanager import sys @@ -16,19 +16,20 @@ class Dim3(object): - ''' + """ Used to implement thread/block indices/dimensions - ''' + """ + def __init__(self, x, y, z): self.x = x self.y = y self.z = z def __str__(self): - return '(%s, %s, %s)' % (self.x, self.y, self.z) + return "(%s, %s, %s)" % (self.x, self.y, self.z) def __repr__(self): - return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z) + return "Dim3(%s, %s, %s)" % (self.x, self.y, self.z) def __iter__(self): yield self.x @@ -37,9 +38,9 @@ def __iter__(self): class GridGroup: - ''' + """ Used to implement the grid group. 
- ''' + """ def sync(self): # Synchronization of the grid group is equivalent to synchronization of @@ -49,17 +50,19 @@ def sync(self): class FakeCUDACg: - ''' + """ CUDA Cooperative Groups - ''' + """ + def this_grid(self): return GridGroup() class FakeCUDALocal(object): - ''' + """ CUDA Local arrays - ''' + """ + def array(self, shape, dtype): if isinstance(dtype, types.Type): dtype = numpy_support.as_dtype(dtype) @@ -67,21 +70,23 @@ def array(self, shape, dtype): class FakeCUDAConst(object): - ''' + """ CUDA Const arrays - ''' + """ + def array_like(self, ary): return ary class FakeCUDAShared(object): - ''' + """ CUDA Shared arrays. Limitations: assumes that only one call to cuda.shared.array is on a line, and that that line is only executed once per thread. i.e.:: - a = cuda.shared.array(...); b = cuda.shared.array(...) + a = cuda.shared.array(...) + b = cuda.shared.array(...) will erroneously alias a and b, and:: @@ -90,7 +95,7 @@ class FakeCUDAShared(object): will alias all arrays created at that point (though it is not certain that this would be supported by Numba anyway). - ''' + """ def __init__(self, dynshared_size): self._allocations = {} @@ -274,13 +279,13 @@ def hexp2(self, x): return np.exp2(x, dtype=np.float16) def hexp10(self, x): - return np.float16(10 ** x) + return np.float16(10**x) def hsqrt(self, x): return np.sqrt(x, dtype=np.float16) def hrsqrt(self, x): - return np.float16(x ** -0.5) + return np.float16(x**-0.5) def hceil(self, x): return np.ceil(x, dtype=np.float16) @@ -323,7 +328,7 @@ def hmin(self, a, b): class FakeCUDAModule(object): - ''' + """ An instance of this class will be injected into the __globals__ for an executing function in order to implement calls to cuda.*. This will fail to work correctly if the user code does:: @@ -331,7 +336,7 @@ class FakeCUDAModule(object): from numba import cuda as something_else In other words, the CUDA module must be called cuda. 
- ''' + """ def __init__(self, grid_dim, block_dim, dynshared_size): self.gridDim = Dim3(*grid_dim) @@ -426,11 +431,11 @@ def cbrt(self, a): return a ** (1 / 3) def brev(self, val): - return int('{:032b}'.format(val)[::-1], 2) + return int("{:032b}".format(val)[::-1], 2) def clz(self, val): - s = '{:032b}'.format(val) - return len(s) - len(s.lstrip('0')) + s = "{:032b}".format(val) + return len(s) - len(s.lstrip("0")) def ffs(self, val): # The algorithm is: @@ -438,8 +443,8 @@ def ffs(self, val): # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on. # 3. If we've counted 32 zeros (resulting in 33), there were no bits # set so we need to return zero. - s = '{:032b}'.format(val) - r = (len(s) - len(s.rstrip('0')) + 1) % 33 + s = "{:032b}".format(val) + r = (len(s) - len(s.rstrip("0")) + 1) % 33 return r def selp(self, a, b, c): diff --git a/numba_cuda/numba/cuda/simulator/reduction.py b/numba_cuda/numba/cuda/simulator/reduction.py index 1b819c043..5a3a8e87b 100644 --- a/numba_cuda/numba/cuda/simulator/reduction.py +++ b/numba_cuda/numba/cuda/simulator/reduction.py @@ -9,6 +9,7 @@ def reduce_wrapper(seq, res=None, init=0): return None else: return r + return reduce_wrapper diff --git a/numba_cuda/numba/cuda/simulator/vector_types.py b/numba_cuda/numba/cuda/simulator/vector_types.py index 82a6fbe8a..55792d9b8 100644 --- a/numba_cuda/numba/cuda/simulator/vector_types.py +++ b/numba_cuda/numba/cuda/simulator/vector_types.py @@ -3,7 +3,7 @@ class SimulatedVectorType: - attributes = ['x', 'y', 'z', 'w'] + attributes = ["x", "y", "z", "w"] def __init__(self, *args): args_flattened = [] @@ -12,7 +12,7 @@ def __init__(self, *args): args_flattened += arg.as_list() else: args_flattened.append(arg) - self._attrs = self.attributes[:len(args_flattened)] + self._attrs = self.attributes[: len(args_flattened)] if not self.num_elements == len(args_flattened): raise TypeError( f"{self.name} expects {self.num_elements}" @@ -35,11 +35,15 @@ def as_list(self): def 
make_simulated_vector_type(num_elements, name): - obj = type(name, (SimulatedVectorType,), { - "num_elements": num_elements, - "base_type": types.float32, - "name": name - }) + obj = type( + name, + (SimulatedVectorType,), + { + "num_elements": num_elements, + "base_type": types.float32, + "name": name, + }, + ) obj.user_facing_object = obj return obj @@ -48,8 +52,8 @@ def _initialize(): _simulated_vector_types = {} for stub in _vector_type_stubs: num_elements = int(stub.__name__[-1]) - _simulated_vector_types[stub.__name__] = ( - make_simulated_vector_type(num_elements, stub.__name__) + _simulated_vector_types[stub.__name__] = make_simulated_vector_type( + num_elements, stub.__name__ ) _simulated_vector_types[stub.__name__].aliases = stub.aliases return _simulated_vector_types diff --git a/numba_cuda/numba/cuda/simulator_init.py b/numba_cuda/numba/cuda/simulator_init.py index 9d7dd124a..fb2120632 100644 --- a/numba_cuda/numba/cuda/simulator_init.py +++ b/numba_cuda/numba/cuda/simulator_init.py @@ -4,14 +4,12 @@ def is_available(): - """Returns a boolean to indicate the availability of a CUDA GPU. - """ + """Returns a boolean to indicate the availability of a CUDA GPU.""" # Simulator is always available return True def cuda_error(): - """Returns None or an exception if the CUDA driver fails to initialize. - """ + """Returns None or an exception if the CUDA driver fails to initialize.""" # Simulator never fails to initialize return None diff --git a/numba_cuda/numba/cuda/stubs.py b/numba_cuda/numba/cuda/stubs.py index 205cf8045..a16607699 100644 --- a/numba_cuda/numba/cuda/stubs.py +++ b/numba_cuda/numba/cuda/stubs.py @@ -1,6 +1,7 @@ """ This scripts specifies all PTX special objects. 
""" + import numpy as np from collections import defaultdict import functools @@ -9,12 +10,13 @@ class Stub(object): - ''' + """ A stub object to represent special objects that are meaningless outside the context of a CUDA kernel - ''' - _description_ = '' - __slots__ = () # don't allocate __dict__ + """ + + _description_ = "" + __slots__ = () # don't allocate __dict__ def __new__(cls): raise NotImplementedError("%s is not instantiable" % cls) @@ -24,23 +26,26 @@ def __repr__(self): def stub_function(fn): - ''' + """ A stub function to represent special functions that are meaningless outside the context of a CUDA kernel - ''' + """ + @functools.wraps(fn) def wrapped(*args, **kwargs): raise NotImplementedError("%s cannot be called from host code" % fn) + return wrapped -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Thread and grid indices and dimensions class Dim3(Stub): - '''A triple, (x, y, z)''' - _description_ = '' + """A triple, (x, y, z)""" + + _description_ = "" @property def x(self): @@ -56,68 +61,76 @@ def z(self): class threadIdx(Dim3): - ''' + """ The thread indices in the current thread block. Each index is an integer spanning the range from 0 inclusive to the corresponding value of the attribute in :attr:`numba.cuda.blockDim` exclusive. - ''' - _description_ = '' + """ + + _description_ = "" class blockIdx(Dim3): - ''' + """ The block indices in the grid of thread blocks. Each index is an integer spanning the range from 0 inclusive to the corresponding value of the attribute in :attr:`numba.cuda.gridDim` exclusive. - ''' - _description_ = '' + """ + + _description_ = "" class blockDim(Dim3): - ''' + """ The shape of a block of threads, as declared when instantiating the kernel. This value is the same for all threads in a given kernel launch, even if they belong to different blocks (i.e. each block is "full"). 
- ''' - _description_ = '' + """ + + _description_ = "" class gridDim(Dim3): - ''' + """ The shape of the grid of blocks. This value is the same for all threads in a given kernel launch. - ''' - _description_ = '' + """ + + _description_ = "" class warpsize(Stub): - ''' + """ The size of a warp. All architectures implemented to date have a warp size of 32. - ''' - _description_ = '' + """ + + _description_ = "" class laneid(Stub): - ''' + """ This thread's lane within a warp. Ranges from 0 to :attr:`numba.cuda.warpsize` - 1. - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # Array creation + class shared(Stub): - ''' + """ Shared memory namespace - ''' - _description_ = '' + """ + + _description_ = "" @stub_function def array(shape, dtype): - ''' + """ Allocate a shared array of the given *shape* and *type*. *shape* is either an integer or a tuple of integers representing the array's dimensions. *type* is a :ref:`Numba type ` of the @@ -125,83 +138,89 @@ def array(shape, dtype): The returned array-like object can be read and written to like any normal device array (e.g. through indexing). - ''' + """ class local(Stub): - ''' + """ Local memory namespace - ''' - _description_ = '' + """ + + _description_ = "" @stub_function def array(shape, dtype): - ''' + """ Allocate a local array of the given *shape* and *type*. The array is private to the current thread, and resides in global memory. An array-like object is returned which can be read and written to like any standard array (e.g. through indexing). - ''' + """ class const(Stub): - ''' + """ Constant memory namespace - ''' + """ @stub_function def array_like(ndarray): - ''' + """ Create a const array from *ndarry*. The resulting const array will have the same shape, type, and values as *ndarray*. 
- ''' + """ # ------------------------------------------------------------------------------- # warp level operations + class syncwarp(Stub): - ''' + """ syncwarp(mask=0xFFFFFFFF) Synchronizes a masked subset of threads in a warp. - ''' - _description_ = '' + """ + + _description_ = "" class shfl_sync_intrinsic(Stub): - ''' + """ shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp) Nvvm intrinsic for shuffling data across a warp docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove - ''' - _description_ = '' + """ + + _description_ = "" class vote_sync_intrinsic(Stub): - ''' + """ vote_sync_intrinsic(mask, mode, predictate) Nvvm intrinsic for performing a reduce and broadcast across a warp docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote - ''' - _description_ = '' + """ + + _description_ = "" class match_any_sync(Stub): - ''' + """ match_any_sync(mask, value) Nvvm intrinsic for performing a compare and broadcast across a warp. Returns a mask of threads that have same value as the given value from within the masked warp. - ''' - _description_ = '' + """ + + _description_ = "" class match_all_sync(Stub): - ''' + """ match_all_sync(mask, value) Nvvm intrinsic for performing a compare and broadcast across a warp. @@ -209,12 +228,13 @@ class match_all_sync(Stub): same value as the given value from within the masked warp, if they all have the same value, otherwise it is 0. Pred is a boolean of whether or not all threads in the mask warp have the same warp. - ''' - _description_ = '' + """ + + _description_ = "" class activemask(Stub): - ''' + """ activemask() Returns a 32-bit integer mask of all currently active threads in the @@ -222,47 +242,54 @@ class activemask(Stub): activemask() is called. Inactive threads are represented by 0 bits in the returned mask. Threads which have exited the kernel are always marked as inactive. 
- ''' - _description_ = '' + """ + + _description_ = "" class lanemask_lt(Stub): - ''' + """ lanemask_lt() Returns a 32-bit integer mask of all lanes (including inactive ones) with ID less than the current lane. - ''' - _description_ = '' + """ + + _description_ = "" # ------------------------------------------------------------------------------- # memory fences + class threadfence_block(Stub): - ''' + """ A memory fence at thread block level - ''' - _description_ = '' + """ + + _description_ = "" class threadfence_system(Stub): - ''' + """ A memory fence at system level: across devices - ''' - _description_ = '' + """ + + _description_ = "" class threadfence(Stub): - ''' + """ A memory fence at device level - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # bit manipulation + class popc(Stub): """ popc(x) @@ -297,9 +324,10 @@ class ffs(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # comparison and selection instructions + class selp(Stub): """ selp(a, b, c) @@ -309,9 +337,10 @@ class selp(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # single / double precision arithmetic + class fma(Stub): """ fma(a, b, c) @@ -321,20 +350,21 @@ class fma(Stub): class cbrt(Stub): - """" + """ " cbrt(a) Perform the cube root operation. 
""" -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # atomic + class atomic(Stub): - """Namespace for atomic operations - """ - _description_ = '' + """Namespace for atomic operations""" + + _description_ = "" class add(Stub): """add(ary, idx, val) @@ -401,8 +431,7 @@ class dec(Stub): Performs:: - ary[idx] = (val if (ary[idx] == 0) or - (ary[idx] > val) else ary[idx] - 1) + ary[idx] = val if (ary[idx] == 0) or (ary[idx] > val) else ary[idx] - 1 Supported on uint32, and uint64 operands only. @@ -497,26 +526,29 @@ class cas(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # timers + class nanosleep(Stub): - ''' + """ nanosleep(ns) Suspends the thread for a sleep duration approximately close to the delay `ns`, specified in nanoseconds. - ''' - _description_ = '' + """ + + _description_ = "" -#------------------------------------------------------------------------------- + +# ------------------------------------------------------------------------------- # Floating point 16 class fp16(Stub): - """Namespace for fp16 operations - """ - _description_ = '' + """Namespace for fp16 operations""" + + _description_ = "" class hadd(Stub): """hadd(a, b) @@ -817,9 +849,10 @@ class hmin(Stub): """ -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # vector types + def make_vector_type_stubs(): """Make user facing objects for vector types""" vector_type_stubs = [] @@ -833,7 +866,7 @@ def make_vector_type_stubs(): "uint32", "uint64", "float32", - "float64" + "float64", ) vector_type_element_counts = (1, 2, 3, 4) vector_type_attribute_names = ("x", "y", "z", "w") @@ -845,21 +878,25 @@ def make_vector_type_stubs(): attr_names = 
vector_type_attribute_names[:nelem] vector_type_stub = type( - type_name, (Stub,), + type_name, + (Stub,), { **{attr: lambda self: None for attr in attr_names}, **{ "_description_": f"<{type_name}>", - "__signature__": Signature(parameters=[ - Parameter( - name=attr_name, kind=Parameter.POSITIONAL_ONLY - ) for attr_name in attr_names[:nelem] - ]), + "__signature__": Signature( + parameters=[ + Parameter( + name=attr_name, kind=Parameter.POSITIONAL_ONLY + ) + for attr_name in attr_names[:nelem] + ] + ), "__doc__": f"A stub for {type_name} to be used in " - "CUDA kernels." + "CUDA kernels.", }, - **{"aliases": []} - } + **{"aliases": []}, + }, ) vector_type_stubs.append(vector_type_stub) return vector_type_stubs @@ -884,7 +921,7 @@ def map_vector_type_stubs_to_alias(vector_type_stubs): "ulong": f"uint{np.dtype(np.uint).itemsize * 8}", "ulonglong": f"uint{np.dtype(np.ulonglong).itemsize * 8}", "float": f"float{np.dtype(np.single).itemsize * 8}", - "double": f"float{np.dtype(np.double).itemsize * 8}" + "double": f"float{np.dtype(np.double).itemsize * 8}", } base_type_to_vector_type = defaultdict(list) diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py index 5713cdf93..ead46f446 100644 --- a/numba_cuda/numba/cuda/target.py +++ b/numba_cuda/numba/cuda/target.py @@ -35,19 +35,21 @@ def load_additional_registries(self): def resolve_value_type(self, val): # treat other dispatcher object as another device function from numba.cuda.dispatcher import CUDADispatcher - if (isinstance(val, Dispatcher) and not - isinstance(val, CUDADispatcher)): + + if isinstance(val, Dispatcher) and not isinstance(val, CUDADispatcher): try: # use cached device function val = val.__dispatcher except AttributeError: if not val._can_compile: - raise ValueError('using cpu function on device ' - 'but its compilation is disabled') + raise ValueError( + "using cpu function on device " + "but its compilation is disabled" + ) targetoptions = val.targetoptions.copy() - 
targetoptions['device'] = True - targetoptions['debug'] = targetoptions.get('debug', False) - targetoptions['opt'] = targetoptions.get('opt', True) + targetoptions["device"] = True + targetoptions["debug"] = targetoptions.get("debug", False) + targetoptions["opt"] = targetoptions.get("opt", True) disp = CUDADispatcher(val.py_func, targetoptions) # cache the device function for future use and to avoid # duplicated copy of the same function. @@ -57,18 +59,19 @@ def resolve_value_type(self, val): # continue with parent logic return super(CUDATypingContext, self).resolve_value_type(val) + # ----------------------------------------------------------------------------- # Implementation -VALID_CHARS = re.compile(r'[^a-z0-9]', re.I) +VALID_CHARS = re.compile(r"[^a-z0-9]", re.I) class CUDATargetContext(BaseContext): implement_powi_as_math_call = True strict_alignment = True - def __init__(self, typingctx, target='cuda'): + def __init__(self, typingctx, target="cuda"): super().__init__(typingctx, target) self.data_model_manager = cuda_data_manager.chain( datamodel.default_manager @@ -76,7 +79,7 @@ def __init__(self, typingctx, target='cuda'): @property def enable_nrt(self): - return getattr(config, 'CUDA_ENABLE_NRT', False) + return getattr(config, "CUDA_ENABLE_NRT", False) @property def DIBuilder(self): @@ -98,18 +101,17 @@ def init(self): def load_additional_registries(self): # side effect of import needed for numba.cpython.*, the builtins # registry is updated at import time. 
- from numba.cpython import numbers, tupleobj, slicing # noqa: F401 - from numba.cpython import rangeobj, iterators, enumimpl # noqa: F401 - from numba.cpython import unicode, charseq # noqa: F401 + from numba.cpython import numbers, tupleobj, slicing # noqa: F401 + from numba.cpython import rangeobj, iterators, enumimpl # noqa: F401 + from numba.cpython import unicode, charseq # noqa: F401 from numba.cpython import cmathimpl from numba.misc import cffiimpl - from numba.np import arrayobj # noqa: F401 - from numba.np import npdatetime # noqa: F401 - from . import ( - cudaimpl, printimpl, libdeviceimpl, mathimpl, vector_types - ) + from numba.np import arrayobj # noqa: F401 + from numba.np import npdatetime # noqa: F401 + from . import cudaimpl, printimpl, libdeviceimpl, mathimpl, vector_types + # fix for #8940 - from numba.np.unsafe import ndarray # noqa F401 + from numba.np.unsafe import ndarray # noqa F401 self.install_registry(cudaimpl.registry) self.install_registry(cffiimpl.registry) @@ -136,10 +138,18 @@ def nonconst_module_attrs(self): These include threadIdx, blockDim, etc. 
""" from numba import cuda - nonconsts = ('threadIdx', 'blockDim', 'blockIdx', 'gridDim', 'laneid', - 'warpsize') - nonconsts_with_mod = tuple([(types.Module(cuda), nc) - for nc in nonconsts]) + + nonconsts = ( + "threadIdx", + "blockDim", + "blockIdx", + "gridDim", + "laneid", + "warpsize", + ) + nonconsts_with_mod = tuple( + [(types.Module(cuda), nc) for nc in nonconsts] + ) return nonconsts_with_mod @cached_property @@ -147,8 +157,9 @@ def call_conv(self): return CUDACallConv(self) def mangler(self, name, argtypes, *, abi_tags=(), uid=None): - return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, - uid=uid) + return itanium_mangler.mangle( + name, argtypes, abi_tags=abi_tags, uid=uid + ) def make_constant_array(self, builder, aryty, arr): """ @@ -160,15 +171,16 @@ def make_constant_array(self, builder, aryty, arr): constvals = [ self.get_constant(types.byte, i) - for i in iter(arr.tobytes(order='A')) + for i in iter(arr.tobytes(order="A")) ] constaryty = ir.ArrayType(ir.IntType(8), len(constvals)) constary = ir.Constant(constaryty, constvals) addrspace = nvvm.ADDRSPACE_CONSTANT - gv = cgutils.add_global_variable(lmod, constary.type, "_cudapy_cmem", - addrspace=addrspace) - gv.linkage = 'internal' + gv = cgutils.add_global_variable( + lmod, constary.type, "_cudapy_cmem", addrspace=addrspace + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = constary @@ -179,17 +191,21 @@ def make_constant_array(self, builder, aryty, arr): # Convert to generic address-space ptrty = ir.PointerType(ir.IntType(8)) - genptr = builder.addrspacecast(gv, ptrty, 'generic') + genptr = builder.addrspacecast(gv, ptrty, "generic") # Create array object ary = self.make_array(aryty)(self, builder) kshape = [self.get_constant(types.intp, s) for s in arr.shape] kstrides = [self.get_constant(types.intp, s) for s in arr.strides] - self.populate_array(ary, data=builder.bitcast(genptr, ary.data.type), - shape=kshape, - strides=kstrides, - itemsize=ary.itemsize, 
parent=ary.parent, - meminfo=None) + self.populate_array( + ary, + data=builder.bitcast(genptr, ary.data.type), + shape=kshape, + strides=kstrides, + itemsize=ary.itemsize, + parent=ary.parent, + meminfo=None, + ) return ary._getvalue() @@ -199,15 +215,17 @@ def insert_const_string(self, mod, string): addrspace. """ text = cgutils.make_bytearray(string.encode("utf-8") + b"\x00") - name = '$'.join(["__conststring__", - itanium_mangler.mangle_identifier(string)]) + name = "$".join( + ["__conststring__", itanium_mangler.mangle_identifier(string)] + ) # Try to reuse existing global gv = mod.globals.get(name) if gv is None: # Not defined yet - gv = cgutils.add_global_variable(mod, text.type, name, - addrspace=nvvm.ADDRSPACE_CONSTANT) - gv.linkage = 'internal' + gv = cgutils.add_global_variable( + mod, text.type, name, addrspace=nvvm.ADDRSPACE_CONSTANT + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = text @@ -225,11 +243,10 @@ def insert_string_const_addrspace(self, builder, string): lmod = builder.module gv = self.insert_const_string(lmod, string) charptrty = ir.PointerType(ir.IntType(8)) - return builder.addrspacecast(gv, charptrty, 'generic') + return builder.addrspacecast(gv, charptrty, "generic") def optimize_function(self, func): - """Run O1 function passes - """ + """Run O1 function passes""" pass ## XXX skipped for now # fpm = lp.FunctionPassManager.new(func.module) @@ -266,8 +283,9 @@ def _make_call_helper(self, builder): def return_value(self, builder, retval): return builder.ret(retval) - def return_user_exc(self, builder, exc, exc_args=None, loc=None, - func_name=None): + def return_user_exc( + self, builder, exc, exc_args=None, loc=None, func_name=None + ): msg = "Python exceptions are unsupported in the CUDA C/C++ ABI" raise NotImplementedError(msg) @@ -290,8 +308,7 @@ def decorate_function(self, fn, args, fe_argtypes, noalias=False): """ assert not noalias arginfo = self._get_arg_packer(fe_argtypes) - 
arginfo.assign_names(self.get_arguments(fn), - ['arg.' + a for a in args]) + arginfo.assign_names(self.get_arguments(fn), ["arg." + a for a in args]) def get_arguments(self, func): """ diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py index 3c2d7bf46..86a95e789 100644 --- a/numba_cuda/numba/cuda/testing.py +++ b/numba_cuda/numba/cuda/testing.py @@ -11,7 +11,7 @@ import unittest numba_cuda_dir = Path(__file__).parent -test_data_dir = numba_cuda_dir / 'tests' / 'data' +test_data_dir = numba_cuda_dir / "tests" / "data" class CUDATestCase(SerialMixin, TestCase): @@ -55,6 +55,7 @@ class ContextResettingTestCase(CUDATestCase): def tearDown(self): super().tearDown() from numba.cuda.cudadrv.devices import reset + reset() @@ -89,26 +90,26 @@ def skip_unless_conda_cudatoolkit(reason): def skip_if_external_memmgr(reason): """Skip test if an EMM Plugin is in use""" - return unittest.skipIf(config.CUDA_MEMORY_MANAGER != 'default', reason) + return unittest.skipIf(config.CUDA_MEMORY_MANAGER != "default", reason) def skip_under_cuda_memcheck(reason): - return unittest.skipIf(os.environ.get('CUDA_MEMCHECK') is not None, reason) + return unittest.skipIf(os.environ.get("CUDA_MEMCHECK") is not None, reason) def skip_without_nvdisasm(reason): - nvdisasm_path = shutil.which('nvdisasm') + nvdisasm_path = shutil.which("nvdisasm") return unittest.skipIf(nvdisasm_path is None, reason) def skip_with_nvdisasm(reason): - nvdisasm_path = shutil.which('nvdisasm') + nvdisasm_path = shutil.which("nvdisasm") return unittest.skipIf(nvdisasm_path is not None, reason) def skip_on_arm(reason): cpu = platform.processor() - is_arm = cpu.startswith('arm') or cpu.startswith('aarch') + is_arm = cpu.startswith("arm") or cpu.startswith("aarch") return unittest.skipIf(is_arm, reason) @@ -116,25 +117,27 @@ def skip_if_cuda_includes_missing(fn): # Skip when cuda.h is not available - generally this should indicate # whether the CUDA includes are available or not cuda_include_path = 
libs.get_cuda_include_dir() - cuda_h = os.path.join(cuda_include_path, 'cuda.h') - cuda_h_file = (os.path.exists(cuda_h) and os.path.isfile(cuda_h)) - reason = 'CUDA include dir not available on this system' + cuda_h = os.path.join(cuda_include_path, "cuda.h") + cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h) + reason = "CUDA include dir not available on this system" return unittest.skipUnless(cuda_h_file, reason)(fn) def skip_if_curand_kernel_missing(fn): cuda_include_path = libs.get_cuda_include_dir() - curand_kernel_h = os.path.join(cuda_include_path, 'curand_kernel.h') - curand_kernel_h_file = (os.path.exists(curand_kernel_h) and - os.path.isfile(curand_kernel_h)) - reason = 'curand_kernel.h not available on this system' + curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h") + curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile( + curand_kernel_h + ) + reason = "curand_kernel.h not available on this system" return unittest.skipUnless(curand_kernel_h_file, reason)(fn) def skip_if_mvc_enabled(reason): """Skip a test if Minor Version Compatibility is enabled""" - return unittest.skipIf(config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, - reason) + return unittest.skipIf( + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, reason + ) def skip_if_mvc_libraries_unavailable(fn): @@ -142,12 +145,14 @@ def skip_if_mvc_libraries_unavailable(fn): try: import cubinlinker # noqa: F401 import ptxcompiler # noqa: F401 + libs_available = True except ImportError: pass - return unittest.skipUnless(libs_available, - "Requires cubinlinker and ptxcompiler")(fn) + return unittest.skipUnless( + libs_available, "Requires cubinlinker and ptxcompiler" + )(fn) def cc_X_or_above(major, minor): @@ -189,7 +194,7 @@ def cudadevrt_missing(): if config.ENABLE_CUDASIM: return False try: - path = libs.get_cudalib('cudadevrt', static=True) + path = libs.get_cudalib("cudadevrt", static=True) libs.check_static_lib(path) except FileNotFoundError: 
return True @@ -197,7 +202,7 @@ def cudadevrt_missing(): def skip_if_cudadevrt_missing(fn): - return unittest.skipIf(cudadevrt_missing(), 'cudadevrt missing')(fn) + return unittest.skipIf(cudadevrt_missing(), "cudadevrt missing")(fn) class ForeignArray(object): diff --git a/numba_cuda/numba/cuda/tests/__init__.py b/numba_cuda/numba/cuda/tests/__init__.py index 425a52b2e..d04d546ed 100644 --- a/numba_cuda/numba/cuda/tests/__init__.py +++ b/numba_cuda/numba/cuda/tests/__init__.py @@ -19,18 +19,19 @@ def load_testsuite(loader, dir): files = [] for f in os.listdir(dir): path = join(dir, f) - if isfile(path) and fnmatch(f, 'test_*.py'): + if isfile(path) and fnmatch(f, "test_*.py"): files.append(f) - elif isfile(join(path, '__init__.py')): - suite.addTests(loader.discover(path, - top_level_dir=top_level_dir)) + elif isfile(join(path, "__init__.py")): + suite.addTests( + loader.discover(path, top_level_dir=top_level_dir) + ) for f in files: # turn 'f' into a filename relative to the toplevel dir and # translate it to a module name. This differs from the # implementation in Numba, because the toplevel dir is the # numba_cuda module location, not the numba one. 
f = relpath(join(dir, f), top_level_dir) - f = splitext(normpath(f.replace(os.path.sep, '.')))[0] + f = splitext(normpath(f.replace(os.path.sep, ".")))[0] suite.addTests(loader.loadTestsFromName(f)) return suite except Exception: @@ -42,16 +43,17 @@ def load_tests(loader, tests, pattern): suite = unittest.TestSuite() this_dir = dirname(__file__) ensure_supported_ccs_initialized() - suite.addTests(load_testsuite(loader, join(this_dir, 'nocuda'))) + suite.addTests(load_testsuite(loader, join(this_dir, "nocuda"))) if cuda.is_available(): - suite.addTests(load_testsuite(loader, join(this_dir, 'cudasim'))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudasim"))) gpus = cuda.list_devices() if gpus and gpus[0].compute_capability >= (2, 0): - suite.addTests(load_testsuite(loader, join(this_dir, 'cudadrv'))) - suite.addTests(load_testsuite(loader, join(this_dir, 'cudapy'))) - suite.addTests(load_testsuite(loader, join(this_dir, 'nrt'))) - suite.addTests(load_testsuite(loader, join(this_dir, - 'doc_examples'))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudadrv"))) + suite.addTests(load_testsuite(loader, join(this_dir, "cudapy"))) + suite.addTests(load_testsuite(loader, join(this_dir, "nrt"))) + suite.addTests( + load_testsuite(loader, join(this_dir, "doc_examples")) + ) else: print("skipped CUDA tests because GPU CC < 2.0") else: diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py b/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py index 32f75c855..27a61cf5e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py @@ -4,7 +4,6 @@ class TestArrayAttr(CUDATestCase): - def test_contigous_2d(self): ary = np.arange(10) cary = ary.reshape(2, 5) @@ -44,7 +43,7 @@ def test_contigous_4d(self): def test_ravel_1d(self): ary = np.arange(60) dary = cuda.to_device(ary) - for order in 'CFA': + for order in "CFA": expect = ary.ravel(order=order) dflat = 
dary.ravel(order=order) flat = dflat.copy_to_host() @@ -52,14 +51,14 @@ def test_ravel_1d(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_1d(self): ary = np.arange(60) dary = cuda.to_device(ary) # No-copy stride device array darystride = dary[::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] self.assertEqual(dary_data, ddarystride_data) # Fail on ravel on non-contiguous array with self.assertRaises(NotImplementedError): @@ -69,7 +68,7 @@ def test_ravel_c(self): ary = np.arange(60) reshaped = ary.reshape(2, 5, 2, 3) - expect = reshaped.ravel(order='C') + expect = reshaped.ravel(order="C") dary = cuda.to_device(reshaped) dflat = dary.ravel() flat = dflat.copy_to_host() @@ -78,7 +77,7 @@ def test_ravel_c(self): self.assertPreciseEqual(expect, flat) # explicit order kwarg - for order in 'CA': + for order in "CA": expect = reshaped.ravel(order=order) dary = cuda.to_device(reshaped) dflat = dary.ravel(order=order) @@ -87,15 +86,15 @@ def test_ravel_c(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_c(self): ary = np.arange(60) reshaped = ary.reshape(2, 5, 2, 3) dary = cuda.to_device(reshaped) darystride = dary[::2, ::2, ::2, ::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] 
self.assertEqual(dary_data, ddarystride_data) with self.assertRaises(NotImplementedError): darystride.ravel() @@ -103,7 +102,7 @@ def test_ravel_stride_c(self): def test_ravel_f(self): ary = np.arange(60) reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3)) - for order in 'FA': + for order in "FA": expect = reshaped.ravel(order=order) dary = cuda.to_device(reshaped) dflat = dary.ravel(order=order) @@ -112,14 +111,14 @@ def test_ravel_f(self): self.assertEqual(flat.ndim, 1) self.assertPreciseEqual(expect, flat) - @skip_on_cudasim('CUDA Array Interface is not supported in the simulator') + @skip_on_cudasim("CUDA Array Interface is not supported in the simulator") def test_ravel_stride_f(self): ary = np.arange(60) reshaped = np.asfortranarray(ary.reshape(2, 5, 2, 3)) dary = cuda.to_device(reshaped) darystride = dary[::2, ::2, ::2, ::2] - dary_data = dary.__cuda_array_interface__['data'][0] - ddarystride_data = darystride.__cuda_array_interface__['data'][0] + dary_data = dary.__cuda_array_interface__["data"][0] + ddarystride_data = darystride.__cuda_array_interface__["data"][0] self.assertEqual(dary_data, ddarystride_data) with self.assertRaises(NotImplementedError): darystride.ravel() @@ -134,12 +133,12 @@ def test_reshape_c(self): def test_reshape_f(self): ary = np.arange(10) - expect = ary.reshape(2, 5, order='F') + expect = ary.reshape(2, 5, order="F") dary = cuda.to_device(ary) - dary_reshaped = dary.reshape(2, 5, order='F') + dary_reshaped = dary.reshape(2, 5, order="F") got = dary_reshaped.copy_to_host() self.assertPreciseEqual(expect, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py index 030052507..049804e7a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py @@ -27,7 +27,6 @@ def test_gpus_iter(self): class 
TestContextAPI(CUDATestCase): - def tearDown(self): super().tearDown() cuda.close() @@ -36,7 +35,7 @@ def test_context_memory(self): try: mem = cuda.current_context().get_memory_info() except NotImplementedError: - self.skipTest('EMM Plugin does not implement get_memory_info()') + self.skipTest("EMM Plugin does not implement get_memory_info()") self.assertIsInstance(mem.free, numbers.Number) self.assertEqual(mem.free, mem[0]) @@ -47,7 +46,7 @@ def test_context_memory(self): self.assertLessEqual(mem.free, mem.total) @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") - @skip_on_cudasim('CUDA HW required') + @skip_on_cudasim("CUDA HW required") def test_forbidden_context_switch(self): # Cannot switch context inside a `cuda.require_context` @cuda.require_context @@ -72,7 +71,7 @@ def switch_gpu(): self.assertEqual(int(devid), 1) -@skip_on_cudasim('CUDA HW required') +@skip_on_cudasim("CUDA HW required") class Test3rdPartyContext(CUDATestCase): def tearDown(self): super().tearDown() @@ -118,8 +117,9 @@ def test_attached_non_primary(self): cuda.current_context() except RuntimeError as e: # Expecting an error about non-primary CUDA context - self.assertIn("Numba cannot operate on non-primary CUDA context ", - str(e)) + self.assertIn( + "Numba cannot operate on non-primary CUDA context ", str(e) + ) else: self.fail("No RuntimeError raised") finally: @@ -141,5 +141,5 @@ def foo(a): self.test_attached_primary(do) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py index 5033a115f..ec8f239b3 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py @@ -11,7 +11,7 @@ class CudaArrayIndexing(CUDATestCase): def test_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) - x, = arr.shape + (x,) = arr.shape for i in 
range(-x, x): self.assertEqual(arr[i], darr[i]) with self.assertRaises(IndexError): @@ -58,7 +58,6 @@ def test_index_3d(self): class CudaArrayStridedSlice(CUDATestCase): - def test_strided_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) @@ -71,8 +70,9 @@ def test_strided_index_2d(self): for i in range(arr.shape[0]): for j in range(arr.shape[1]): - np.testing.assert_equal(arr[i::2, j::2], - darr[i::2, j::2].copy_to_host()) + np.testing.assert_equal( + arr[i::2, j::2], darr[i::2, j::2].copy_to_host() + ) def test_strided_index_3d(self): arr = np.arange(6 * 7 * 8).reshape(6, 7, 8) @@ -83,7 +83,8 @@ def test_strided_index_3d(self): for k in range(arr.shape[2]): np.testing.assert_equal( arr[i::2, j::2, k::2], - darr[i::2, j::2, k::2].copy_to_host()) + darr[i::2, j::2, k::2].copy_to_host(), + ) class CudaArraySlicing(CUDATestCase): @@ -96,7 +97,7 @@ def test_prefix_1d(self): self.assertTrue(np.all(expect == got)) def test_prefix_2d(self): - arr = np.arange(3 ** 2).reshape(3, 3) + arr = np.arange(3**2).reshape(3, 3) darr = cuda.to_device(arr) for i in range(arr.shape[0]): for j in range(arr.shape[1]): @@ -129,39 +130,45 @@ def test_select_3d_first_two_dim(self): self.assertTrue(np.all(expect == got)) def test_select_f(self): - a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='F') + a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order="F") da = cuda.to_device(a) for i in range(a.shape[0]): for j in range(a.shape[1]): - self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(), - a[i, j, :])) + self.assertTrue( + np.array_equal(da[i, j, :].copy_to_host(), a[i, j, :]) + ) for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(), - a[i, :, j])) + self.assertTrue( + np.array_equal(da[i, :, j].copy_to_host(), a[i, :, j]) + ) for i in range(a.shape[1]): for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(), - a[:, i, j])) + self.assertTrue( + np.array_equal(da[:, i, j].copy_to_host(), a[:, i, j]) + ) def 
test_select_c(self): - a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order='C') + a = np.arange(5 * 6 * 7).reshape(5, 6, 7, order="C") da = cuda.to_device(a) for i in range(a.shape[0]): for j in range(a.shape[1]): - self.assertTrue(np.array_equal(da[i, j, :].copy_to_host(), - a[i, j, :])) + self.assertTrue( + np.array_equal(da[i, j, :].copy_to_host(), a[i, j, :]) + ) for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[i, :, j].copy_to_host(), - a[i, :, j])) + self.assertTrue( + np.array_equal(da[i, :, j].copy_to_host(), a[i, :, j]) + ) for i in range(a.shape[1]): for j in range(a.shape[2]): - self.assertTrue(np.array_equal(da[:, i, j].copy_to_host(), - a[:, i, j])) + self.assertTrue( + np.array_equal(da[:, i, j].copy_to_host(), a[:, i, j]) + ) def test_prefix_select(self): - arr = np.arange(5 * 7).reshape(5, 7, order='F') + arr = np.arange(5 * 7).reshape(5, 7, order="F") darr = cuda.to_device(arr) self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1])) @@ -170,15 +177,15 @@ def test_negative_slicing_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) for i, j in product(range(-10, 10), repeat=2): - np.testing.assert_array_equal(arr[i:j], - darr[i:j].copy_to_host()) + np.testing.assert_array_equal(arr[i:j], darr[i:j].copy_to_host()) def test_negative_slicing_2d(self): arr = np.arange(12).reshape(3, 4) darr = cuda.to_device(arr) for x, y, w, s in product(range(-4, 4), repeat=4): - np.testing.assert_array_equal(arr[x:y, w:s], - darr[x:y, w:s].copy_to_host()) + np.testing.assert_array_equal( + arr[x:y, w:s], darr[x:y, w:s].copy_to_host() + ) def test_empty_slice_1d(self): arr = np.arange(5) @@ -188,10 +195,10 @@ def test_empty_slice_1d(self): # empty slice of empty slice self.assertFalse(darr[:0][:0].copy_to_host()) # out-of-bound slice just produces empty slices - np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), - arr[:0][:1]) - np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(), - arr[:0][-1:]) + 
np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1]) + np.testing.assert_array_equal( + darr[:0][-1:].copy_to_host(), arr[:0][-1:] + ) def test_empty_slice_2d(self): arr = np.arange(5 * 7).reshape(5, 7) @@ -202,8 +209,9 @@ def test_empty_slice_2d(self): self.assertFalse(darr[:0][:0].copy_to_host()) # out-of-bound slice just produces empty slices np.testing.assert_array_equal(darr[:0][:1].copy_to_host(), arr[:0][:1]) - np.testing.assert_array_equal(darr[:0][-1:].copy_to_host(), - arr[:0][-1:]) + np.testing.assert_array_equal( + darr[:0][-1:].copy_to_host(), arr[:0][-1:] + ) class CudaArraySetting(CUDATestCase): @@ -292,7 +300,8 @@ def test_incompatible_highdim(self): "Can't assign 3-D array to 1-D self", # device "could not broadcast input array from shape (2,3) " "into shape (35,)", # simulator, NP >= 1.20 - ]) + ], + ) def test_incompatible_shape(self): darr = cuda.to_device(np.arange(5)) @@ -306,57 +315,67 @@ def test_incompatible_shape(self): "Can't copy sequence with size 2 to array axis 0 with " "dimension 5", # device "could not broadcast input array from shape (2,) into " - "shape (5,)", # simulator, NP >= 1.20 - ]) + "shape (5,)", # simulator, NP >= 1.20 + ], + ) - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_sync(self): # There should be a synchronization when no stream is supplied darr = cuda.to_device(np.arange(5)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr[0] = 10 mock_sync.assert_called_once() - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_no_sync_default_stream(self): # There should not be a synchronization when the array has a default # 
stream, whether it is the default stream, the legacy default stream, # the per-thread default stream, or another stream. - streams = (cuda.stream(), cuda.default_stream(), - cuda.legacy_default_stream(), - cuda.per_thread_default_stream()) + streams = ( + cuda.stream(), + cuda.default_stream(), + cuda.legacy_default_stream(), + cuda.per_thread_default_stream(), + ) for stream in streams: darr = cuda.to_device(np.arange(5), stream=stream) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr[0] = 10 mock_sync.assert_not_called() - @skip_on_cudasim('cudasim does not use streams and operates synchronously') + @skip_on_cudasim("cudasim does not use streams and operates synchronously") def test_no_sync_supplied_stream(self): # There should not be a synchronization when a stream is supplied for # the setitem call, whether it is the default stream, the legacy default # stream, the per-thread default stream, or another stream. 
- streams = (cuda.stream(), cuda.default_stream(), - cuda.legacy_default_stream(), - cuda.per_thread_default_stream()) + streams = ( + cuda.stream(), + cuda.default_stream(), + cuda.legacy_default_stream(), + cuda.per_thread_default_stream(), + ) for stream in streams: darr = cuda.to_device(np.arange(5)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: darr.setitem(0, 10, stream=stream) mock_sync.assert_not_called() - @unittest.skip('Requires PR #6367') + @unittest.skip("Requires PR #6367") def test_issue_6505(self): # On Windows, the writes to ary_v would not be visible prior to the # assertion, due to the assignment being done with a kernel launch that @@ -365,11 +384,11 @@ def test_issue_6505(self): ary = cuda.mapped_array(2, dtype=np.int32) ary[:] = 0 - ary_v = ary.view('u1') + ary_v = ary.view("u1") ary_v[1] = 1 ary_v[5] = 1 self.assertEqual(sum(ary), 512) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py index 4a4d59310..45815be70 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py @@ -17,5 +17,5 @@ def test_auto_context(self): self.assertTrue(np.allclose(A, newA)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py index e2acd34d7..b13c8f979 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py @@ -1,7 +1,10 @@ import numpy as np import ctypes -from numba.cuda.cudadrv.devicearray import (DeviceRecord, from_record_like, 
- auto_device) +from numba.cuda.cudadrv.devicearray import ( + DeviceRecord, + from_record_like, + auto_device, +) from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim from numba.np import numpy_support @@ -11,43 +14,37 @@ recordtype = np.dtype( [ - ('a', np.float64), - ('b', np.int32), - ('c', np.complex64), - ('d', (np.str_, N_CHARS)) + ("a", np.float64), + ("b", np.int32), + ("c", np.complex64), + ("d", (np.str_, N_CHARS)), ], - align=True + align=True, ) -recordwitharray = np.dtype( - [ - ('g', np.int32), - ('h', np.float32, 2) - ], - align=True -) +recordwitharray = np.dtype([("g", np.int32), ("h", np.float32, 2)], align=True) -recwithmat = np.dtype([('i', np.int32), - ('j', np.float32, (3, 3))]) +recwithmat = np.dtype([("i", np.int32), ("j", np.float32, (3, 3))]) -recwithrecwithmat = np.dtype([('x', np.int32), ('y', recwithmat)]) +recwithrecwithmat = np.dtype([("x", np.int32), ("y", recwithmat)]) -@skip_on_cudasim('Device Record API unsupported in the simulator') +@skip_on_cudasim("Device Record API unsupported in the simulator") class TestCudaDeviceRecord(CUDATestCase): """ Tests the DeviceRecord class with np.void host types. 
""" + def setUp(self): super().setUp() self._create_data(np.zeros) def _create_data(self, array_ctor): - self.dtype = np.dtype([('a', np.int32), ('b', np.float32)], align=True) + self.dtype = np.dtype([("a", np.int32), ("b", np.float32)], align=True) self.hostz = array_ctor(1, self.dtype)[0] self.hostnz = array_ctor(1, self.dtype)[0] - self.hostnz['a'] = 10 - self.hostnz['b'] = 11.0 + self.hostnz["a"] = 10 + self.hostnz["b"] = 11.0 def _check_device_record(self, reference, rec): self.assertEqual(rec.shape, tuple()) @@ -111,21 +108,22 @@ class TestCudaDeviceRecordWithRecord(TestCudaDeviceRecord): """ Tests the DeviceRecord class with np.record host types """ + def setUp(self): CUDATestCase.setUp(self) self._create_data(np.recarray) -@skip_on_cudasim('Structured array attr access not supported in simulator') +@skip_on_cudasim("Structured array attr access not supported in simulator") class TestRecordDtypeWithStructArrays(CUDATestCase): - ''' + """ Test operation of device arrays on structured arrays. 
- ''' + """ def _createSampleArrays(self): self.sample1d = cuda.device_array(3, dtype=recordtype) self.samplerec1darr = cuda.device_array(1, dtype=recordwitharray)[0] - self.samplerecmat = cuda.device_array(1,dtype=recwithmat)[0] + self.samplerecmat = cuda.device_array(1, dtype=recwithmat)[0] def setUp(self): super().setUp() @@ -134,46 +132,46 @@ def setUp(self): ary = self.sample1d for i in range(ary.size): x = i + 1 - ary[i]['a'] = x / 2 - ary[i]['b'] = x - ary[i]['c'] = x * 1j - ary[i]['d'] = str(x) * N_CHARS + ary[i]["a"] = x / 2 + ary[i]["b"] = x + ary[i]["c"] = x * 1j + ary[i]["d"] = str(x) * N_CHARS def test_structured_array1(self): ary = self.sample1d for i in range(self.sample1d.size): x = i + 1 - self.assertEqual(ary[i]['a'], x / 2) - self.assertEqual(ary[i]['b'], x) - self.assertEqual(ary[i]['c'], x * 1j) - self.assertEqual(ary[i]['d'], str(x) * N_CHARS) + self.assertEqual(ary[i]["a"], x / 2) + self.assertEqual(ary[i]["b"], x) + self.assertEqual(ary[i]["c"], x * 1j) + self.assertEqual(ary[i]["d"], str(x) * N_CHARS) def test_structured_array2(self): ary = self.samplerec1darr - ary['g'] = 2 - ary['h'][0] = 3.0 - ary['h'][1] = 4.0 - self.assertEqual(ary['g'], 2) - self.assertEqual(ary['h'][0], 3.0) - self.assertEqual(ary['h'][1], 4.0) + ary["g"] = 2 + ary["h"][0] = 3.0 + ary["h"][1] = 4.0 + self.assertEqual(ary["g"], 2) + self.assertEqual(ary["h"][0], 3.0) + self.assertEqual(ary["h"][1], 4.0) def test_structured_array3(self): ary = self.samplerecmat - mat = np.array([[5.0, 10.0, 15.0], - [20.0, 25.0, 30.0], - [35.0, 40.0, 45.0]], - dtype=np.float32).reshape(3,3) - ary['j'][:] = mat - np.testing.assert_equal(ary['j'], mat) + mat = np.array( + [[5.0, 10.0, 15.0], [20.0, 25.0, 30.0], [35.0, 40.0, 45.0]], + dtype=np.float32, + ).reshape(3, 3) + ary["j"][:] = mat + np.testing.assert_equal(ary["j"], mat) def test_structured_array4(self): arr = np.zeros(1, dtype=recwithrecwithmat) d_arr = cuda.to_device(arr) - d_arr[0]['y']['i'] = 1 - 
self.assertEqual(d_arr[0]['y']['i'], 1) - d_arr[0]['y']['j'][0, 0] = 2.0 - self.assertEqual(d_arr[0]['y']['j'][0, 0], 2.0) + d_arr[0]["y"]["i"] = 1 + self.assertEqual(d_arr[0]["y"]["i"], 1) + d_arr[0]["y"]["j"][0, 0] = 2.0 + self.assertEqual(d_arr[0]["y"]["j"][0, 0], 2.0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py index ea9d72fa8..6972a44ed 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py @@ -1,13 +1,17 @@ from ctypes import byref, c_int, c_void_p, sizeof -from numba.cuda.cudadrv.driver import (host_to_device, device_to_host, driver, - launch_kernel) +from numba.cuda.cudadrv.driver import ( + host_to_device, + device_to_host, + driver, + launch_kernel, +) from numba.cuda.cudadrv import devices, drvapi, driver as _driver from numba.cuda.testing import unittest, CUDATestCase from numba.cuda.testing import skip_on_cudasim -ptx1 = ''' +ptx1 = """ .version 1.4 .target sm_10, map_f64_to_f32 @@ -29,9 +33,9 @@ exit; $LDWend__Z10helloworldPi: } // _Z10helloworldPi -''' +""" -ptx2 = ''' +ptx2 = """ .version 3.0 .target sm_20 .address_size 64 @@ -57,10 +61,10 @@ .loc 2 7 2 ret; } -''' +""" -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestCudaDriver(CUDATestCase): def setUp(self): super().setUp() @@ -79,7 +83,7 @@ def tearDown(self): def test_cuda_driver_basic(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") array = (c_int * 100)() @@ -93,12 +97,18 @@ def test_cuda_driver_basic(self): ptr = c_void_p(int(ptr)) stream = _driver.binding.CUstream(stream) - launch_kernel(function.handle, # Kernel - 1, 1, 1, # gx, gy, gz - 100, 1, 1, # bx, by, bz 
- 0, # dynamic shared mem - stream, # stream - [ptr]) # arguments + launch_kernel( + function.handle, # Kernel + 1, + 1, + 1, # gx, gy, gz + 100, + 1, + 1, # bx, by, bz + 0, # dynamic shared mem + stream, # stream + [ptr], + ) # arguments device_to_host(array, memory, sizeof(array)) for i, v in enumerate(array): @@ -108,7 +118,7 @@ def test_cuda_driver_basic(self): def test_cuda_driver_stream_operations(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") array = (c_int * 100)() @@ -122,12 +132,18 @@ def test_cuda_driver_stream_operations(self): if _driver.USE_NV_BINDING: ptr = c_void_p(int(ptr)) - launch_kernel(function.handle, # Kernel - 1, 1, 1, # gx, gy, gz - 100, 1, 1, # bx, by, bz - 0, # dynamic shared mem - stream.handle, # stream - [ptr]) # arguments + launch_kernel( + function.handle, # Kernel + 1, + 1, + 1, # gx, gy, gz + 100, + 1, + 1, # bx, by, bz + 0, # dynamic shared mem + stream.handle, # stream + [ptr], + ) # arguments device_to_host(array, memory, sizeof(array), stream=stream) @@ -193,17 +209,19 @@ def test_cuda_driver_external_stream(self): def test_cuda_driver_occupancy(self): module = self.context.create_module_ptx(self.ptx) - function = module.get_function('_Z10helloworldPi') + function = module.get_function("_Z10helloworldPi") - value = self.context.get_active_blocks_per_multiprocessor(function, - 128, 128) + value = self.context.get_active_blocks_per_multiprocessor( + function, 128, 128 + ) self.assertTrue(value > 0) def b2d(bs): return bs - grid, block = self.context.get_max_potential_block_size(function, b2d, - 128, 128) + grid, block = self.context.get_max_potential_block_size( + function, b2d, 128, 128 + ) self.assertTrue(grid > 0) self.assertTrue(block > 0) @@ -221,15 +239,15 @@ def test_device_get_uuid(self): # 4122) pertaining to versions and variants, so we do not extract and # validate the values of these bits. 
- h = '[0-9a-f]{%d}' + h = "[0-9a-f]{%d}" h4 = h % 4 h8 = h % 8 h12 = h % 12 - uuid_format = f'^GPU-{h8}-{h4}-{h4}-{h4}-{h12}$' + uuid_format = f"^GPU-{h8}-{h4}-{h4}-{h4}-{h12}$" dev = devices.get_context().device self.assertRegex(dev.uuid, uuid_format) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py index 890bf6829..f80c44ada 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py @@ -3,7 +3,7 @@ from numba.misc.findlib import find_lib -@skip_on_cudasim('Library detection unsupported in the simulator') +@skip_on_cudasim("Library detection unsupported in the simulator") @skip_unless_conda_cudatoolkit class TestLibraryDetection(unittest.TestCase): def test_detect(self): @@ -13,10 +13,10 @@ def test_detect(self): PyCulib (and potentially others) rely on Numba's library finding capacity to find and subsequently load these libraries. 
""" - core_libs = ['nvvm'] + core_libs = ["nvvm"] for l in core_libs: self.assertNotEqual(find_lib(l), []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py index 6402f7773..5d187411f 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py @@ -7,7 +7,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('CUDA Memory API unsupported in the simulator') +@skip_on_cudasim("CUDA Memory API unsupported in the simulator") class TestCudaMemory(ContextResettingTestCase): def setUp(self): super().setUp() @@ -24,8 +24,7 @@ def _template(self, obj): expected_class = driver.binding.CUdeviceptr else: expected_class = drvapi.cu_device_ptr - self.assertTrue(isinstance(obj.device_ctypes_pointer, - expected_class)) + self.assertTrue(isinstance(obj.device_ctypes_pointer, expected_class)) def test_device_memory(self): devmem = self.context.memalloc(1024) @@ -41,9 +40,9 @@ def test_host_alloc(self): def test_pinned_memory(self): ary = np.arange(10) - devmem = self.context.mempin(ary, ary.ctypes.data, - ary.size * ary.dtype.itemsize, - mapped=True) + devmem = self.context.mempin( + ary, ary.ctypes.data, ary.size * ary.dtype.itemsize, mapped=True + ) self._template(devmem) def test_managed_memory(self): @@ -69,8 +68,7 @@ def check(m, offset): v2 = v1.view(offset) self.assertEqual(handle_val(v2.owner), handle_val(m)) self.assertEqual(handle_val(v2.owner), handle_val(m)) - self.assertEqual(handle_val(v2) - offset * 2, - handle_val(v2.owner)) + self.assertEqual(handle_val(v2) - offset * 2, handle_val(v2.owner)) self.assertEqual(m.refct, 3) del v2 self.assertEqual(m.refct, 2) @@ -84,22 +82,24 @@ def check(m, offset): def test_user_extension(self): # User can use MemoryPointer to wrap externally defined pointers. 
# This test checks if the finalizer is invokded at correct time - fake_ptr = ctypes.c_void_p(0xdeadbeef) + fake_ptr = ctypes.c_void_p(0xDEADBEEF) dtor_invoked = [0] def dtor(): dtor_invoked[0] += 1 # Ensure finalizer is called when pointer is deleted - ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr, - size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) self.assertEqual(dtor_invoked[0], 0) del ptr self.assertEqual(dtor_invoked[0], 1) # Ensure removing derived pointer doesn't call finalizer - ptr = driver.MemoryPointer(context=self.context, pointer=fake_ptr, - size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) owned = ptr.own() del owned self.assertEqual(dtor_invoked[0], 1) @@ -128,16 +128,16 @@ def test_memcpy(self): self.assertTrue(np.all(hstary == hstary2)) def test_memset(self): - dtype = np.dtype('uint32') + dtype = np.dtype("uint32") n = 10 sz = dtype.itemsize * 10 devary = self.context.memalloc(sz) - driver.device_memset(devary, 0xab, sz) + driver.device_memset(devary, 0xAB, sz) hstary = np.empty(n, dtype=dtype) driver.device_to_host(hstary, devary, sz) - hstary2 = np.array([0xabababab] * n, dtype=np.dtype('uint32')) + hstary2 = np.array([0xABABABAB] * n, dtype=np.dtype("uint32")) self.assertTrue(np.all(hstary == hstary2)) def test_d2d(self): @@ -152,7 +152,7 @@ def test_d2d(self): self.assertTrue(np.all(hst == hst2)) -@skip_on_cudasim('CUDA Memory API unsupported in the simulator') +@skip_on_cudasim("CUDA Memory API unsupported in the simulator") class TestMVExtent(ContextResettingTestCase): def test_c_contiguous_array(self): ary = np.arange(100) @@ -177,7 +177,7 @@ def test_single_element_array(self): def test_ctypes_struct(self): class mystruct(ctypes.Structure): - _fields_ = [('x', ctypes.c_int), ('y', ctypes.c_int)] + _fields_ = [("x", ctypes.c_int), ("y", ctypes.c_int)] data = 
mystruct(x=123, y=432) sz = driver.host_memory_size(data) @@ -189,5 +189,5 @@ def test_ctypes_double(self): self.assertTrue(ctypes.sizeof(data) == sz) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py index fd6150882..68ebd234a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py @@ -57,10 +57,7 @@ def test_devicearray(self): def test_stream_bind(self): stream = cuda.stream() with stream.auto_synchronize(): - arr = cuda.device_array( - (3, 3), - dtype=np.float64, - stream=stream) + arr = cuda.device_array((3, 3), dtype=np.float64, stream=stream) self.assertEqual(arr.bind(stream).stream, stream) self.assertEqual(arr.stream, stream) @@ -90,8 +87,8 @@ def test_devicearray_partition(self): self.assertTrue(np.all(array == 0)) - right.copy_to_host(array[N // 2:]) - left.copy_to_host(array[:N // 2]) + right.copy_to_host(array[N // 2 :]) + left.copy_to_host(array[: N // 2]) self.assertTrue(np.all(array == original)) @@ -104,7 +101,7 @@ def test_devicearray_replace(self): gpumem.copy_to_host(array) np.testing.assert_array_equal(array, original * 2) - @skip_on_cudasim('This works in the simulator') + @skip_on_cudasim("This works in the simulator") def test_devicearray_transpose_wrongdim(self): gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4, 1)) @@ -113,13 +110,15 @@ def test_devicearray_transpose_wrongdim(self): self.assertEqual( "transposing a non-2D DeviceNDArray isn't supported", - str(e.exception)) + str(e.exception), + ) def test_devicearray_transpose_identity(self): # any-shape identities should work original = np.array(np.arange(24)).reshape(3, 4, 2) - array = np.transpose(cuda.to_device(original), - axes=(0, 1, 2)).copy_to_host() + array = np.transpose( + cuda.to_device(original), axes=(0, 1, 2) + ).copy_to_host() 
self.assertTrue(np.all(array == original)) def test_devicearray_transpose_duplicatedaxis(self): @@ -131,9 +130,10 @@ def test_devicearray_transpose_duplicatedaxis(self): self.assertIn( str(e.exception), container=[ - 'invalid axes list (0, 0)', # GPU - 'repeated axis in transpose', # sim - ]) + "invalid axes list (0, 0)", # GPU + "repeated axis in transpose", # sim + ], + ) def test_devicearray_transpose_wrongaxis(self): gpumem = cuda.to_device(np.array(np.arange(12)).reshape(3, 4)) @@ -144,10 +144,11 @@ def test_devicearray_transpose_wrongaxis(self): self.assertIn( str(e.exception), container=[ - 'invalid axes list (0, 2)', # GPU - 'invalid axis for this array', - 'axis 2 is out of bounds for array of dimension 2', # sim - ]) + "invalid axes list (0, 2)", # GPU + "invalid axis for this array", + "axis 2 is out of bounds for array of dimension 2", # sim + ], + ) def test_devicearray_view_ok(self): original = np.array(np.arange(12), dtype="i2").reshape(3, 4) @@ -155,8 +156,7 @@ def test_devicearray_view_ok(self): for dtype in ("i4", "u4", "i8", "f8"): with self.subTest(dtype=dtype): np.testing.assert_array_equal( - array.view(dtype).copy_to_host(), - original.view(dtype) + array.view(dtype).copy_to_host(), original.view(dtype) ) def test_devicearray_view_ok_not_c_contig(self): @@ -164,8 +164,7 @@ def test_devicearray_view_ok_not_c_contig(self): array = cuda.to_device(original)[:, ::2] original = original[:, ::2] np.testing.assert_array_equal( - array.view("u2").copy_to_host(), - original.view("u2") + array.view("u2").copy_to_host(), original.view("u2") ) def test_devicearray_view_bad_not_c_contig(self): @@ -175,12 +174,14 @@ def test_devicearray_view_bad_not_c_contig(self): array.view("i4") msg = str(e.exception) - self.assertIn('To change to a dtype of a different size,', msg) + self.assertIn("To change to a dtype of a different size,", msg) - contiguous_pre_np123 = 'the array must be C-contiguous' in msg - contiguous_post_np123 = 'the last axis must be contiguous' 
in msg - self.assertTrue(contiguous_pre_np123 or contiguous_post_np123, - 'Expected message to mention contiguity') + contiguous_pre_np123 = "the array must be C-contiguous" in msg + contiguous_post_np123 = "the last axis must be contiguous" in msg + self.assertTrue( + contiguous_pre_np123 or contiguous_post_np123, + "Expected message to mention contiguity", + ) def test_devicearray_view_bad_itemsize(self): original = np.array(np.arange(12), dtype="i2").reshape(4, 3) @@ -191,7 +192,8 @@ def test_devicearray_view_bad_itemsize(self): "When changing to a larger dtype," " its size must be a divisor of the total size in bytes" " of the last axis of the array.", - str(e.exception)) + str(e.exception), + ) def test_devicearray_transpose_ok(self): original = np.array(np.arange(12)).reshape(3, 4) @@ -206,7 +208,7 @@ def test_devicearray_transpose_T(self): def test_devicearray_contiguous_slice(self): # memcpys are dumb ranges of bytes, so trying to # copy to a non-contiguous range shouldn't work! - a = np.arange(25).reshape(5, 5, order='F') + a = np.arange(25).reshape(5, 5, order="F") s = np.full(fill_value=5, shape=(5,)) d = cuda.to_device(a) @@ -216,9 +218,7 @@ def test_devicearray_contiguous_slice(self): # (40-byte strides). This means we can't memcpy to it! 
with self.assertRaises(ValueError) as e: d[2].copy_to_device(s) - self.assertEqual( - devicearray.errmsg_contiguous_buffer, - str(e.exception)) + self.assertEqual(devicearray.errmsg_contiguous_buffer, str(e.exception)) # if d[2].copy_to_device(s), then this would pass: # self.assertTrue((a == d.copy_to_host()).all()) @@ -236,9 +236,9 @@ def _test_devicearray_contiguous_host_copy(self, a_c, a_f): (a_c, a_f), (a_c, a_c), ]: - msg = '%s => %s' % ( - 'C' if original.flags.c_contiguous else 'F', - 'C' if copy.flags.c_contiguous else 'F', + msg = "%s => %s" % ( + "C" if original.flags.c_contiguous else "F", + "C" if copy.flags.c_contiguous else "F", ) d = cuda.to_device(original) @@ -248,17 +248,17 @@ def _test_devicearray_contiguous_host_copy(self, a_c, a_f): def test_devicearray_contiguous_copy_host_3d(self): a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self._test_devicearray_contiguous_host_copy(a_c, a_f) def test_devicearray_contiguous_copy_host_1d(self): a_c = np.arange(5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self._test_devicearray_contiguous_host_copy(a_c, a_f) def test_devicearray_contiguous_copy_device(self): a_c = np.arange(5 * 5 * 5).reshape(5, 5, 5) - a_f = np.array(a_c, order='F') + a_f = np.array(a_c, order="F") self.assertTrue(a_c.flags.c_contiguous) self.assertTrue(a_f.flags.f_contiguous) @@ -268,7 +268,8 @@ def test_devicearray_contiguous_copy_device(self): d.copy_to_device(cuda.to_device(a_f)) self.assertEqual( "incompatible strides: {} vs. {}".format(a_c.strides, a_f.strides), - str(e.exception)) + str(e.exception), + ) d.copy_to_device(cuda.to_device(a_c)) self.assertTrue(np.all(d.copy_to_host() == a_c)) @@ -279,7 +280,8 @@ def test_devicearray_contiguous_copy_device(self): d.copy_to_device(cuda.to_device(a_c)) self.assertEqual( "incompatible strides: {} vs. 
{}".format(a_f.strides, a_c.strides), - str(e.exception)) + str(e.exception), + ) d.copy_to_device(cuda.to_device(a_f)) self.assertTrue(np.all(d.copy_to_host() == a_f)) @@ -288,8 +290,8 @@ def test_devicearray_broadcast_host_copy(self): broadsize = 4 coreshape = (2, 3) coresize = np.prod(coreshape) - core_c = np.arange(coresize).reshape(coreshape, order='C') - core_f = np.arange(coresize).reshape(coreshape, order='F') + core_c = np.arange(coresize).reshape(coreshape, order="C") + core_f = np.arange(coresize).reshape(coreshape, order="F") for dim in range(len(coreshape)): newindex = (slice(None),) * dim + (np.newaxis,) broadshape = coreshape[:dim] + (broadsize,) + coreshape[dim:] @@ -318,11 +320,9 @@ def test_devicearray_contiguous_device_strided(self): with self.assertRaises(ValueError) as e: d.copy_to_device(cuda.to_device(arr)[::2]) - self.assertEqual( - devicearray.errmsg_contiguous_buffer, - str(e.exception)) + self.assertEqual(devicearray.errmsg_contiguous_buffer, str(e.exception)) - @skip_on_cudasim('DeviceNDArray class not present in simulator') + @skip_on_cudasim("DeviceNDArray class not present in simulator") def test_devicearray_relaxed_strides(self): # From the reproducer in Issue #6824. @@ -334,86 +334,88 @@ def test_devicearray_relaxed_strides(self): # Ensure we still believe the array to be contiguous because # strides checking is relaxed. - self.assertTrue(arr.flags['C_CONTIGUOUS']) - self.assertTrue(arr.flags['F_CONTIGUOUS']) + self.assertTrue(arr.flags["C_CONTIGUOUS"]) + self.assertTrue(arr.flags["F_CONTIGUOUS"]) def test_c_f_contiguity_matches_numpy(self): # From the reproducer in Issue #4943. 
shapes = ((1, 4), (4, 1)) - orders = ('C', 'F') + orders = ("C", "F") for shape, order in itertools.product(shapes, orders): arr = np.ndarray(shape, order=order) d_arr = cuda.to_device(arr) - self.assertEqual(arr.flags['C_CONTIGUOUS'], - d_arr.flags['C_CONTIGUOUS']) - self.assertEqual(arr.flags['F_CONTIGUOUS'], - d_arr.flags['F_CONTIGUOUS']) + self.assertEqual( + arr.flags["C_CONTIGUOUS"], d_arr.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + arr.flags["F_CONTIGUOUS"], d_arr.flags["F_CONTIGUOUS"] + ) - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_simple_c(self): # C-order 1D array - a = np.zeros(10, order='C') + a = np.zeros(10, order="C") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_simple_f(self): # F-order array that is also C layout. 
- a = np.zeros(10, order='F') + a = np.zeros(10, order="F") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_2d_c(self): # C-order 2D array - a = np.zeros((2, 10), order='C') + a = np.zeros((2, 10), order="C") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'C') + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_2d_f(self): # F-order array that can only be F layout - a = np.zeros((2, 10), order='F') + a = np.zeros((2, 10), order="F") d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'F') + self.assertEqual(d._numba_type_.layout, "F") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_noncontig_slice_c(self): # Non-contiguous slice of C-order array - a = np.zeros((5, 5), order='C') - d = cuda.to_device(a)[:,2] - self.assertEqual(d._numba_type_.layout, 'A') + a = np.zeros((5, 5), order="C") + d = cuda.to_device(a)[:, 2] + self.assertEqual(d._numba_type_.layout, "A") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_noncontig_slice_f(self): # Non-contiguous slice of F-order array - a = np.zeros((5, 5), order='F') - d = cuda.to_device(a)[2,:] - self.assertEqual(d._numba_type_.layout, 'A') + a = np.zeros((5, 5), order="F") + d = cuda.to_device(a)[2, :] + self.assertEqual(d._numba_type_.layout, "A") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_contig_slice_c(self): # Contiguous slice of C-order array - a = 
np.zeros((5, 5), order='C') - d = cuda.to_device(a)[2,:] - self.assertEqual(d._numba_type_.layout, 'C') + a = np.zeros((5, 5), order="C") + d = cuda.to_device(a)[2, :] + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_contig_slice_f(self): # Contiguous slice of F-order array - is both C- and F-contiguous, so # types as 'C' layout - a = np.zeros((5, 5), order='F') - d = cuda.to_device(a)[:,2] - self.assertEqual(d._numba_type_.layout, 'C') + a = np.zeros((5, 5), order="F") + d = cuda.to_device(a)[:, 2] + self.assertEqual(d._numba_type_.layout, "C") - @skip_on_cudasim('Typing not done in the simulator') + @skip_on_cudasim("Typing not done in the simulator") def test_devicearray_typing_order_broadcasted(self): # Broadcasted array, similar to that used for passing scalars to ufuncs a = np.broadcast_to(np.array([1]), (10,)) d = cuda.to_device(a) - self.assertEqual(d._numba_type_.layout, 'A') + self.assertEqual(d._numba_type_.layout, "A") def test_bug6697(self): ary = np.arange(10, dtype=np.int16) @@ -421,7 +423,7 @@ def test_bug6697(self): got = np.asarray(dary) self.assertEqual(got.dtype, dary.dtype) - @skip_on_cudasim('DeviceNDArray class not present in simulator') + @skip_on_cudasim("DeviceNDArray class not present in simulator") def test_issue_8477(self): # Ensure that we can copy a zero-length device array to a zero-length # host array when the strides of the device and host arrays differ - @@ -430,8 +432,9 @@ def test_issue_8477(self): # https://github.com/numba/numba/issues/8477. 
# Create a device array with shape (0,) and strides (8,) - dev_array = devicearray.DeviceNDArray(shape=(0,), strides=(8,), - dtype=np.int8) + dev_array = devicearray.DeviceNDArray( + shape=(0,), strides=(8,), dtype=np.int8 + ) # Create a host array with shape (0,) and strides (0,) host_array = np.ndarray(shape=(0,), strides=(0,), dtype=np.int8) @@ -470,8 +473,7 @@ def test_np_array_dtype(self): dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0])) host_array = np.array(dev_array, dtype=dtype) np.testing.assert_equal( - host_array, - dev_array.copy_to_host().astype(dtype) + host_array, dev_array.copy_to_host().astype(dtype) ) @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg") @@ -490,10 +492,13 @@ def test_np_array_copy_true(self): class TestRecarray(CUDATestCase): def test_recarray(self): # From issue #4111 - a = np.recarray((16,), dtype=[ - ("value1", np.int64), - ("value2", np.float64), - ]) + a = np.recarray( + (16,), + dtype=[ + ("value1", np.int64), + ("value2", np.float64), + ], + ) a.value1 = np.arange(a.size, dtype=np.int64) a.value2 = np.arange(a.size, dtype=np.float64) / 100 @@ -518,39 +523,39 @@ class TestCoreContiguous(CUDATestCase): def _test_against_array_core(self, view): self.assertEqual( devicearray.is_contiguous(view), - devicearray.array_core(view).flags['C_CONTIGUOUS'] + devicearray.array_core(view).flags["C_CONTIGUOUS"], ) def test_device_array_like_1d(self): - d_a = cuda.device_array(10, order='C') + d_a = cuda.device_array(10, order="C") self._test_against_array_core(d_a) def test_device_array_like_2d(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") self._test_against_array_core(d_a) def test_device_array_like_2d_transpose(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") self._test_against_array_core(d_a.T) def test_device_array_like_3d(self): - d_a = cuda.device_array((10, 12, 14), order='C') + d_a = 
cuda.device_array((10, 12, 14), order="C") self._test_against_array_core(d_a) def test_device_array_like_1d_f(self): - d_a = cuda.device_array(10, order='F') + d_a = cuda.device_array(10, order="F") self._test_against_array_core(d_a) def test_device_array_like_2d_f(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") self._test_against_array_core(d_a) def test_device_array_like_2d_f_transpose(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") self._test_against_array_core(d_a.T) def test_device_array_like_3d_f(self): - d_a = cuda.device_array((10, 12, 14), order='F') + d_a = cuda.device_array((10, 12, 14), order="F") self._test_against_array_core(d_a) def test_1d_view(self): @@ -560,7 +565,7 @@ def test_1d_view(self): def test_1d_view_f(self): shape = 10 - view = np.zeros(shape, order='F')[::2] + view = np.zeros(shape, order="F")[::2] self._test_against_array_core(view) def test_2d_view(self): @@ -570,9 +575,9 @@ def test_2d_view(self): def test_2d_view_f(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2] + view = np.zeros(shape, order="F")[::2, ::2] self._test_against_array_core(view) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py b/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py index 66fbbc372..7f03912c1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py @@ -3,14 +3,18 @@ import numpy as np from numba import cuda -from numba.cuda.testing import (unittest, skip_on_cudasim, - skip_if_external_memmgr, CUDATestCase) +from numba.cuda.testing import ( + unittest, + skip_on_cudasim, + skip_if_external_memmgr, + CUDATestCase, +) from numba.tests.support import captured_stderr from numba.core import config -@skip_on_cudasim('not supported on CUDASIM') 
-@skip_if_external_memmgr('Deallocation specific to Numba memory management') +@skip_on_cudasim("not supported on CUDASIM") +@skip_if_external_memmgr("Deallocation specific to Numba memory management") class TestDeallocation(CUDATestCase): def test_max_pending_count(self): # get deallocation manager and flush it @@ -41,8 +45,9 @@ def test_max_pending_bytes(self): config.CUDA_DEALLOCS_RATIO = max_pending / mi.total # due to round off error (floor is used in calculating # _max_pending_bytes) it can be off by 1. - self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending, - delta=1) + self.assertAlmostEqual( + deallocs._max_pending_bytes, max_pending, delta=1 + ) # allocate half the max size # this will not trigger deallocation @@ -51,8 +56,11 @@ def test_max_pending_bytes(self): # allocate another remaining # this will not trigger deallocation - cuda.to_device(np.ones(deallocs._max_pending_bytes - - deallocs._size, dtype=np.int8)) + cuda.to_device( + np.ones( + deallocs._max_pending_bytes - deallocs._size, dtype=np.int8 + ) + ) self.assertEqual(len(deallocs), 2) # another byte to trigger .clear() @@ -64,7 +72,7 @@ def test_max_pending_bytes(self): @skip_on_cudasim("defer_cleanup has no effect in CUDASIM") -@skip_if_external_memmgr('Deallocation specific to Numba memory management') +@skip_if_external_memmgr("Deallocation specific to Numba memory management") class TestDeferCleanup(CUDATestCase): def test_basic(self): harr = np.arange(5) @@ -138,11 +146,12 @@ def test_context_manager(self): pass -@skip_on_cudasim('not supported on CUDASIM') +@skip_on_cudasim("not supported on CUDASIM") class TestDel(CUDATestCase): """ Ensure resources are deleted properly without ignored exception. 
""" + @contextmanager def check_ignored_exception(self, ctx): with captured_stderr() as cap: @@ -245,5 +254,5 @@ class MappedException(Exception): pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py b/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py index 528e11bf8..d70b6776e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py @@ -3,8 +3,12 @@ import subprocess import threading from numba import cuda -from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim, - skip_under_cuda_memcheck) +from numba.cuda.testing import ( + unittest, + CUDATestCase, + skip_on_cudasim, + skip_under_cuda_memcheck, +) from numba.tests.support import captured_stdout @@ -14,21 +18,19 @@ def test_cuda_detect(self): with captured_stdout() as out: cuda.detect() output = out.getvalue() - self.assertIn('Found', output) - self.assertIn('CUDA devices', output) + self.assertIn("Found", output) + self.assertIn("CUDA devices", output) -@skip_under_cuda_memcheck('Hangs cuda-memcheck') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") class TestCUDAFindLibs(CUDATestCase): - def run_cmd(self, cmdline, env): - popen = subprocess.Popen(cmdline, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env) + popen = subprocess.Popen( + cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env + ) # finish in 5 minutes or kill it - timeout = threading.Timer(5 * 60., popen.kill) + timeout = threading.Timer(5 * 60.0, popen.kill) try: timeout.start() out, err = popen.communicate() @@ -51,8 +53,8 @@ def kernel(x): cmdline = [sys.executable, "-c", code] return self.run_cmd(cmdline, env_copy) - @skip_on_cudasim('Simulator does not hit device library search code path') - @unittest.skipIf(not sys.platform.startswith('linux'), "linux only") + @skip_on_cudasim("Simulator does not hit device library search code path") + 
@unittest.skipIf(not sys.platform.startswith("linux"), "linux only") def test_cuda_find_lib_errors(self): """ This tests that the find_libs works as expected in the case of an @@ -60,7 +62,7 @@ def test_cuda_find_lib_errors(self): """ # one of these is likely to exist on linux, it's also unlikely that # someone has extracted the contents of libdevice into here! - locs = ['lib', 'lib64'] + locs = ["lib", "lib64"] looking_for = None for l in locs: @@ -71,11 +73,12 @@ def test_cuda_find_lib_errors(self): # This is the testing part, the test will only run if there's a valid # path in which to look if looking_for is not None: - out, err = self.run_test_in_separate_process("NUMBA_CUDA_DRIVER", - looking_for) + out, err = self.run_test_in_separate_process( + "NUMBA_CUDA_DRIVER", looking_for + ) self.assertTrue(out is not None) self.assertTrue(err is not None) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py index 209355ed6..c0ad870bd 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py @@ -8,6 +8,7 @@ from numba.tests.support import linux_only if not config.ENABLE_CUDASIM: + class DeviceOnlyEMMPlugin(cuda.HostOnlyCUDAMemoryManager): """ Dummy EMM Plugin implementation for testing. It memorises which plugin @@ -56,8 +57,9 @@ def finalizer(): # the reference count drops to zero. ctx = weakref.proxy(self.context) ptr = ctypes.c_void_p(alloc_count) - return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size, - finalizer=finalizer) + return cuda.cudadrv.driver.AutoFreePointer( + ctx, ptr, size, finalizer=finalizer + ) def initialize(self): # No special initialization needed. 
@@ -97,7 +99,7 @@ def interface_version(self): return 2 -@skip_on_cudasim('EMM Plugins not supported on CUDA simulator') +@skip_on_cudasim("EMM Plugins not supported on CUDA simulator") class TestDeviceOnlyEMMPlugin(CUDATestCase): """ Tests that the API of an EMM Plugin that implements device allocations @@ -175,7 +177,7 @@ def test_get_ipc_handle(self): self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle) -@skip_on_cudasim('EMM Plugins not supported on CUDA simulator') +@skip_on_cudasim("EMM Plugins not supported on CUDA simulator") class TestBadEMMPluginVersion(CUDATestCase): """ Ensure that Numba rejects EMM Plugins with incompatible version @@ -185,8 +187,8 @@ class TestBadEMMPluginVersion(CUDATestCase): def test_bad_plugin_version(self): with self.assertRaises(RuntimeError) as raises: cuda.set_memory_manager(BadVersionEMMPlugin) - self.assertIn('version 1 required', str(raises.exception)) + self.assertIn("version 1 required", str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py index b611a4a75..f8a7805d5 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_events.py @@ -34,5 +34,5 @@ def test_event_elapsed_stream(self): evtstart.elapsed_time(evtend) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py index 62c4ecafe..02761d958 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py @@ -10,10 +10,9 @@ def test_host_alloc_driver(self): mem = cuda.current_context().memhostalloc(n, mapped=True) dtype = np.dtype(np.uint8) - ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, - buffer=mem) + ary = np.ndarray(shape=n // 
dtype.itemsize, dtype=dtype, buffer=mem) - magic = 0xab + magic = 0xAB driver.device_memset(mem, magic, n) self.assertTrue(np.all(ary == magic)) @@ -46,8 +45,10 @@ def test_host_alloc_mapped(self): self.assertTrue(sum(ary != 0) == 0) def test_host_operators(self): - for ary in [cuda.mapped_array(10, dtype=np.uint32), - cuda.pinned_array(10, dtype=np.uint32)]: + for ary in [ + cuda.mapped_array(10, dtype=np.uint32), + cuda.pinned_array(10, dtype=np.uint32), + ]: ary[:] = range(10) self.assertTrue(sum(ary + 1) == 55) self.assertTrue(sum((ary + 1) * 2 - 1) == 100) @@ -55,11 +56,11 @@ def test_host_operators(self): self.assertTrue(sum(ary <= 5) == 6) self.assertTrue(sum(ary > 6) == 3) self.assertTrue(sum(ary >= 6) == 4) - self.assertTrue(sum(ary ** 2) == 285) + self.assertTrue(sum(ary**2) == 285) self.assertTrue(sum(ary // 2) == 20) self.assertTrue(sum(ary / 2.0) == 22.5) self.assertTrue(sum(ary % 2) == 5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_init.py b/numba_cuda/numba/cuda/tests/cudadrv/test_init.py index 600687fd5..c5dccbd6a 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_init.py @@ -9,7 +9,7 @@ # A mock of cuInit that always raises a CudaAPIError def cuInit_raising(arg): - raise CudaAPIError(999, 'CUDA_ERROR_UNKNOWN') + raise CudaAPIError(999, "CUDA_ERROR_UNKNOWN") # Test code to run in a child that patches driver.cuInit to a variant that @@ -82,45 +82,45 @@ def cuda_disabled_error_test(result_queue): result_queue.put((success, msg)) -@skip_on_cudasim('CUDA Simulator does not initialize driver') +@skip_on_cudasim("CUDA Simulator does not initialize driver") class TestInit(CUDATestCase): def _test_init_failure(self, target, expected): # Run the initialization failure test in a separate subprocess - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=target, 
args=(result_queue,)) proc.start() - proc.join(30) # should complete within 30s + proc.join(30) # should complete within 30s success, msg = result_queue.get() # Ensure the child process raised an exception during initialization # before checking the message if not success: - self.fail('CudaSupportError not raised') + self.fail("CudaSupportError not raised") self.assertIn(expected, msg) def test_init_failure_raising(self): - expected = 'Error at driver init: CUDA_ERROR_UNKNOWN (999)' + expected = "Error at driver init: CUDA_ERROR_UNKNOWN (999)" self._test_init_failure(cuInit_raising_test, expected) def test_init_failure_error(self): - expected = 'CUDA_ERROR_UNKNOWN (999)' + expected = "CUDA_ERROR_UNKNOWN (999)" self._test_init_failure(initialization_error_test, expected) def _test_cuda_disabled(self, target): # Uses _test_init_failure to launch the test in a separate subprocess # with CUDA disabled. - cuda_disabled = os.environ.get('NUMBA_DISABLE_CUDA') - os.environ['NUMBA_DISABLE_CUDA'] = "1" + cuda_disabled = os.environ.get("NUMBA_DISABLE_CUDA") + os.environ["NUMBA_DISABLE_CUDA"] = "1" try: - expected = 'CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1' + expected = "CUDA is disabled due to setting NUMBA_DISABLE_CUDA=1" self._test_init_failure(cuda_disabled_test, expected) finally: if cuda_disabled is not None: - os.environ['NUMBA_DISABLE_CUDA'] = cuda_disabled + os.environ["NUMBA_DISABLE_CUDA"] = cuda_disabled else: - os.environ.pop('NUMBA_DISABLE_CUDA') + os.environ.pop("NUMBA_DISABLE_CUDA") def test_cuda_disabled_raising(self): self._test_cuda_disabled(cuda_disabled_test) @@ -135,5 +135,5 @@ def test_init_success(self): self.assertIsNone(cuda.cuda_error()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py b/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py index 40a6fa599..aeeb5bbd2 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +++ 
b/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py @@ -5,20 +5,23 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('Inline PTX cannot be used in the simulator') +@skip_on_cudasim("Inline PTX cannot be used in the simulator") class TestCudaInlineAsm(ContextResettingTestCase): def test_inline_rsqrt(self): mod = ir.Module(__name__) - mod.triple = 'nvptx64-nvidia-cuda' + mod.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(mod) fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())]) - fn = ir.Function(mod, fnty, 'cu_rsqrt') - bldr = ir.IRBuilder(fn.append_basic_block('entry')) + fn = ir.Function(mod, fnty, "cu_rsqrt") + bldr = ir.IRBuilder(fn.append_basic_block("entry")) rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()]) - inlineasm = ir.InlineAsm(rsqrt_approx_fnty, - 'rsqrt.approx.f32 $0, $1;', - '=f,f', side_effect=True) + inlineasm = ir.InlineAsm( + rsqrt_approx_fnty, + "rsqrt.approx.f32 $0, $1;", + "=f,f", + side_effect=True, + ) val = bldr.load(fn.args[0]) res = bldr.call(inlineasm, [val]) @@ -30,8 +33,8 @@ def test_inline_rsqrt(self): nvvm.set_cuda_kernel(fn) nvvmir = str(mod) ptx = nvvm.compile_ir(nvvmir) - self.assertTrue('rsqrt.approx.f32' in str(ptx)) + self.assertTrue("rsqrt.approx.f32" in str(ptx)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py index 22e2ee837..be018ccef 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py @@ -1,10 +1,9 @@ import numpy as np import warnings from numba.cuda.testing import unittest -from numba.cuda.testing import (skip_on_cudasim, skip_if_cuda_includes_missing) +from numba.cuda.testing import skip_on_cudasim, skip_if_cuda_includes_missing from numba.cuda.testing import CUDATestCase, test_data_dir -from numba.cuda.cudadrv.driver import (CudaAPIError, Linker, - 
LinkerError) +from numba.cuda.cudadrv.driver import CudaAPIError, Linker, LinkerError from numba.cuda.cudadrv.error import NvrtcError from numba.cuda import require_context from numba.tests.support import ignore_internal_warnings @@ -103,25 +102,24 @@ def simple_lmem(A, B, dty): B[i] = C[i] -@skip_on_cudasim('Linking unsupported in the simulator') +@skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): - _NUMBA_NVIDIA_BINDING_0_ENV = {'NUMBA_CUDA_USE_NVIDIA_BINDING': '0'} + _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"} @require_context def test_linker_basic(self): - '''Simply go through the constructor and destructor - ''' + """Simply go through the constructor and destructor""" linker = Linker.new(cc=(5, 3)) del linker def _test_linking(self, eager): global bar # must be a global; other it is recognized as a freevar - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'jitlink.ptx') + link = str(test_data_dir / "jitlink.ptx") if eager: - args = ['void(int32[:], int32[:])'] + args = ["void(int32[:], int32[:])"] else: args = [] @@ -144,9 +142,9 @@ def test_linking_eager_compile(self): self._test_linking(eager=True) def test_linking_cu(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'jitlink.cu') + link = str(test_data_dir / "jitlink.cu") @cuda.jit(link=[link]) def kernel(r, x): @@ -165,36 +163,37 @@ def kernel(r, x): np.testing.assert_array_equal(r, expected) def test_linking_cu_log_warning(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'warn.cu') + link = str(test_data_dir / "warn.cu") with warnings.catch_warnings(record=True) as w: ignore_internal_warnings() - @cuda.jit('void(int32)', link=[link]) + @cuda.jit("void(int32)", link=[link]) def kernel(x): 
bar(x) - self.assertEqual(len(w), 1, 'Expected warnings from NVRTC') + self.assertEqual(len(w), 1, "Expected warnings from NVRTC") # Check the warning refers to the log messages - self.assertIn('NVRTC log messages', str(w[0].message)) + self.assertIn("NVRTC log messages", str(w[0].message)) # Check the message pertaining to the unused variable is provided - self.assertIn('declared but never referenced', str(w[0].message)) + self.assertIn("declared but never referenced", str(w[0].message)) def test_linking_cu_error(self): - bar = cuda.declare_device('bar', 'int32(int32)') + bar = cuda.declare_device("bar", "int32(int32)") - link = str(test_data_dir / 'error.cu') + link = str(test_data_dir / "error.cu") with self.assertRaises(NvrtcError) as e: - @cuda.jit('void(int32)', link=[link]) + + @cuda.jit("void(int32)", link=[link]) def kernel(x): bar(x) msg = e.exception.args[0] # Check the error message refers to the NVRTC compile - self.assertIn('NVRTC Compilation failure', msg) + self.assertIn("NVRTC Compilation failure", msg) # Check the expected error in the CUDA source is reported self.assertIn('identifier "SYNTAX" is undefined', msg) # Check the filename is reported correctly @@ -203,33 +202,37 @@ def kernel(x): def test_linking_unknown_filetype_error(self): expected_err = "Don't know how to link file with extension .cuh" with self.assertRaisesRegex(RuntimeError, expected_err): - @cuda.jit('void()', link=['header.cuh']) + + @cuda.jit("void()", link=["header.cuh"]) def kernel(): pass def test_linking_file_with_no_extension_error(self): expected_err = "Don't know how to link file with no extension" with self.assertRaisesRegex(RuntimeError, expected_err): - @cuda.jit('void()', link=['data']) + + @cuda.jit("void()", link=["data"]) def kernel(): pass @skip_if_cuda_includes_missing def test_linking_cu_cuda_include(self): - link = str(test_data_dir / 'cuda_include.cu') + link = str(test_data_dir / "cuda_include.cu") # An exception will be raised when linking this kernel due 
to the # compile failure if CUDA includes cannot be found by Nvrtc. - @cuda.jit('void()', link=[link]) + @cuda.jit("void()", link=[link]) def kernel(): pass def test_try_to_link_nonexistent(self): with self.assertRaises(LinkerError) as e: - @cuda.jit('void(int32[::1])', link=['nonexistent.a']) + + @cuda.jit("void(int32[::1])", link=["nonexistent.a"]) def f(x): x[0] = 0 - self.assertIn('nonexistent.a not found', e.exception.args) + + self.assertIn("nonexistent.a not found", e.exception.args) def test_set_registers_no_max(self): """Ensure that the jitted kernel used in the test_set_registers_* tests @@ -276,7 +279,8 @@ def test_get_shared_mem_per_block(self): def test_get_shared_mem_per_specialized(self): compiled = cuda.jit(simple_smem) compiled_specialized = compiled.specialize( - np.zeros(100, dtype=np.int32), np.float64) + np.zeros(100, dtype=np.int32), np.float64 + ) shared_mem_size = compiled_specialized.get_shared_mem_per_block() self.assertEqual(shared_mem_size, 800) @@ -307,11 +311,12 @@ def test_get_local_mem_per_specialized(self): compiled_specialized = compiled.specialize( np.zeros(LMEM_SIZE, dtype=np.int32), np.zeros(LMEM_SIZE, dtype=np.int32), - np.float64) + np.float64, + ) local_mem_size = compiled_specialized.get_local_mem_per_thread() calc_size = np.dtype(np.float64).itemsize * LMEM_SIZE self.assertGreaterEqual(local_mem_size, calc_size) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py index e9cc37ca8..1f4eb411e 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py @@ -7,11 +7,10 @@ from numba.tests.support import linux_only -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") @linux_only -@skip_on_arm('Managed Alloc support is 
experimental/untested on ARM') +@skip_on_arm("Managed Alloc support is experimental/untested on ARM") class TestManagedAlloc(ContextResettingTestCase): - def get_total_gpu_memory(self): # We use a driver function to directly get the total GPU memory because # an EMM plugin may report something different (or not implement @@ -85,7 +84,7 @@ def _test_managed_alloc_driver(self, memory_factor, attach_global=True): n_elems = n_bytes // dtype.itemsize ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem) - magic = 0xab + magic = 0xAB device_memset(mem, magic, n_bytes) ctx.synchronize() @@ -102,7 +101,7 @@ def _test_managed_array(self, attach_global=True): ary.fill(123.456) self.assertTrue(all(ary == 123.456)) - @cuda.jit('void(double[:])') + @cuda.jit("void(double[:])") def kernel(x): i = cuda.grid(1) if i < x.shape[0]: @@ -123,5 +122,5 @@ def test_managed_array_attach_host(self): self._test_managed_array(attach_global=False) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py b/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py index c25bc5ae2..4da56e009 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py @@ -1,8 +1,11 @@ import multiprocessing as mp import traceback from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck, - skip_if_mvc_libraries_unavailable) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_under_cuda_memcheck, + skip_if_mvc_libraries_unavailable, +) from numba.tests.support import linux_only @@ -24,7 +27,7 @@ def child_test_wrapper(result_queue): output = child_test() success = True # Catch anything raised so it can be propagated - except: # noqa: E722 + except: # noqa: E722 output = traceback.format_exc() success = False @@ -32,13 +35,13 @@ def child_test_wrapper(result_queue): @linux_only -@skip_under_cuda_memcheck('May hang 
CUDA memcheck') -@skip_on_cudasim('Simulator does not require or implement MVC') +@skip_under_cuda_memcheck("May hang CUDA memcheck") +@skip_on_cudasim("Simulator does not require or implement MVC") @skip_if_mvc_libraries_unavailable class TestMinorVersionCompatibility(CUDATestCase): def test_mvc(self): # Run test with Minor Version Compatibility enabled in a child process - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=child_test_wrapper, args=(result_queue,)) proc.start() @@ -50,5 +53,5 @@ def test_mvc(self): self.fail(output) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 106ab0d30..0fe6177cb 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -10,6 +10,7 @@ try: import pynvjitlink # noqa: F401 + PYNVJITLINK_INSTALLED = True except ImportError: PYNVJITLINK_INSTALLED = False @@ -52,7 +53,7 @@ @unittest.skipIf( not config.CUDA_ENABLE_PYNVJITLINK or not TEST_BIN_DIR, - "pynvjitlink not enabled" + "pynvjitlink not enabled", ) @skip_on_cudasim("Linking unsupported in the simulator") class TestLinker(CUDATestCase): @@ -85,7 +86,6 @@ def test_nvjitlink_invalid_cc_type_error(self): PyNvJitLinker(cc=0) def test_nvjitlink_ptx_compile_options(self): - max_registers = (None, 32) lineinfo = (False, True) lto = (False, True) @@ -190,7 +190,7 @@ def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self): files = [ test_device_functions_cu, test_device_functions_ltoir, - test_device_functions_fatbin_multi + test_device_functions_fatbin_multi, ] config.DUMP_ASSEMBLY = True @@ -228,7 +228,7 @@ def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self): for file in files: with self.subTest(file=file): with warnings.catch_warnings(record=True) as w: - with 
contextlib.redirect_stdout(None): # suppress other PTX + with contextlib.redirect_stdout(None): # suppress other PTX sig = "uint32(uint32, uint32)" add_from_numba = cuda.declare_device( "add_from_numba", sig @@ -243,8 +243,11 @@ def kernel(result): assert result[0] == 3 assert len(w) == 1 - self.assertIn("it is not optimizable at link time, and " - "`ignore_nonlto == True`", str(w[0].message)) + self.assertIn( + "it is not optimizable at link time, and " + "`ignore_nonlto == True`", + str(w[0].message), + ) config.DUMP_ASSEMBLY = False @@ -262,7 +265,7 @@ def kernel(): @unittest.skipIf( not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR, - reason="pynvjitlink not enabled" + reason="pynvjitlink not enabled", ) class TestLinkerUsage(CUDATestCase): """Test that whether pynvjitlink can be enabled by both environment variable @@ -295,12 +298,12 @@ def kernel(result): def test_linker_enabled_envvar(self): env = os.environ.copy() - env['NUMBA_CUDA_ENABLE_PYNVJITLINK'] = "1" + env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "1" run_in_subprocess(self.src.format(config=""), env=env) def test_linker_disabled_envvar(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) with self.assertRaisesRegex( AssertionError, "LTO and additional flags require PyNvJitLinker" ): @@ -310,19 +313,25 @@ def test_linker_disabled_envvar(self): def test_linker_enabled_config(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) - run_in_subprocess(self.src.format( - config="config.CUDA_ENABLE_PYNVJITLINK = True"), env=env) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) + run_in_subprocess( + self.src.format(config="config.CUDA_ENABLE_PYNVJITLINK = True"), + env=env, + ) def test_linker_disabled_config(self): env = os.environ.copy() - env.pop('NUMBA_CUDA_ENABLE_PYNVJITLINK', None) + env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None) with override_config("CUDA_ENABLE_PYNVJITLINK", False): with 
self.assertRaisesRegex( AssertionError, "LTO and additional flags require PyNvJitLinker" ): - run_in_subprocess(self.src.format( - config="config.CUDA_ENABLE_PYNVJITLINK = False"), env=env) + run_in_subprocess( + self.src.format( + config="config.CUDA_ENABLE_PYNVJITLINK = False" + ), + env=env, + ) if __name__ == "__main__": diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py index 309169bfc..fad357243 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py @@ -7,7 +7,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestNvvmDriver(unittest.TestCase): def get_nvvmir(self): versions = NVVM().get_ir_version() @@ -16,9 +16,9 @@ def get_nvvmir(self): def test_nvvm_compile_simple(self): nvvmir = self.get_nvvmir() - ptx = nvvm.compile_ir(nvvmir).decode('utf8') - self.assertTrue('simple' in ptx) - self.assertTrue('ave' in ptx) + ptx = nvvm.compile_ir(nvvmir).decode("utf8") + self.assertTrue("simple" in ptx) + self.assertTrue("ave" in ptx) def test_nvvm_compile_nullary_option(self): # Tests compilation with an option that doesn't take an argument @@ -34,7 +34,7 @@ def test_nvvm_compile_nullary_option(self): # Verify we correctly passed the option by checking if we got LTOIR # from NVVM (by looking for the expected magic number for LTOIR) - self.assertEqual(ltoir[:4], b'\xed\x43\x4e\x7f') + self.assertEqual(ltoir[:4], b"\xed\x43\x4e\x7f") def test_nvvm_bad_option(self): # Ensure that unsupported / non-existent options are reported as such @@ -45,36 +45,37 @@ def test_nvvm_bad_option(self): def test_nvvm_from_llvm(self): m = ir.Module("test_nvvm_from_llvm") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(m) fty = ir.FunctionType(ir.VoidType(), 
[ir.IntType(32)]) - kernel = ir.Function(m, fty, name='mycudakernel') - bldr = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="mycudakernel") + bldr = ir.IRBuilder(kernel.append_basic_block("entry")) bldr.ret_void() nvvm.set_cuda_kernel(kernel) m.data_layout = NVVM().data_layout - ptx = nvvm.compile_ir(str(m)).decode('utf8') - self.assertTrue('mycudakernel' in ptx) - self.assertTrue('.address_size 64' in ptx) + ptx = nvvm.compile_ir(str(m)).decode("utf8") + self.assertTrue("mycudakernel" in ptx) + self.assertTrue(".address_size 64" in ptx) def test_used_list(self): # Construct a module m = ir.Module("test_used_list") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) # Add a function and mark it as a kernel fty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)]) - kernel = ir.Function(m, fty, name='mycudakernel') - bldr = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="mycudakernel") + bldr = ir.IRBuilder(kernel.append_basic_block("entry")) bldr.ret_void() nvvm.set_cuda_kernel(kernel) # Verify that the used list was correctly constructed - used_lines = [line for line in str(m).splitlines() - if 'llvm.used' in line] + used_lines = [ + line for line in str(m).splitlines() if "llvm.used" in line + ] msg = 'Expected exactly one @"llvm.used" array' self.assertEqual(len(used_lines), 1, msg) @@ -93,70 +94,71 @@ def test_nvvm_ir_verify_fail(self): m.triple = "unknown-unknown-unknown" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) - with self.assertRaisesRegex(NvvmError, 'Invalid target triple'): + with self.assertRaisesRegex(NvvmError, "Invalid target triple"): nvvm.compile_ir(str(m)) def _test_nvvm_support(self, arch): - compute_xx = 'compute_{0}{1}'.format(*arch) + compute_xx = "compute_{0}{1}".format(*arch) nvvmir = self.get_nvvmir() - ptx = nvvm.compile_ir(nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0, - 
prec_div=0).decode('utf8') + ptx = nvvm.compile_ir( + nvvmir, arch=compute_xx, ftz=1, prec_sqrt=0, prec_div=0 + ).decode("utf8") self.assertIn(".target sm_{0}{1}".format(*arch), ptx) - self.assertIn('simple', ptx) - self.assertIn('ave', ptx) + self.assertIn("simple", ptx) + self.assertIn("ave", ptx) def test_nvvm_support(self): - """Test supported CC by NVVM - """ + """Test supported CC by NVVM""" for arch in nvvm.get_supported_ccs(): self._test_nvvm_support(arch=arch) def test_nvvm_warning(self): m = ir.Module("test_nvvm_warning") - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" m.data_layout = NVVM().data_layout nvvm.add_ir_version(m) fty = ir.FunctionType(ir.VoidType(), []) - kernel = ir.Function(m, fty, name='inlinekernel') - builder = ir.IRBuilder(kernel.append_basic_block('entry')) + kernel = ir.Function(m, fty, name="inlinekernel") + builder = ir.IRBuilder(kernel.append_basic_block("entry")) builder.ret_void() nvvm.set_cuda_kernel(kernel) # Add the noinline attribute to trigger NVVM to generate a warning - kernel.attributes.add('noinline') + kernel.attributes.add("noinline") with warnings.catch_warnings(record=True) as w: nvvm.compile_ir(str(m)) self.assertEqual(len(w), 1) - self.assertIn('overriding noinline attribute', str(w[0])) + self.assertIn("overriding noinline attribute", str(w[0])) -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestArchOption(unittest.TestCase): def test_get_arch_option(self): # Test returning the nearest lowest arch. - self.assertEqual(nvvm.get_arch_option(5, 3), 'compute_53') - self.assertEqual(nvvm.get_arch_option(7, 5), 'compute_75') - self.assertEqual(nvvm.get_arch_option(7, 7), 'compute_75') + self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53") + self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75") + self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75") # Test known arch. 
supported_cc = nvvm.get_supported_ccs() for arch in supported_cc: - self.assertEqual(nvvm.get_arch_option(*arch), 'compute_%d%d' % arch) - self.assertEqual(nvvm.get_arch_option(1000, 0), - 'compute_%d%d' % supported_cc[-1]) + self.assertEqual(nvvm.get_arch_option(*arch), "compute_%d%d" % arch) + self.assertEqual( + nvvm.get_arch_option(1000, 0), "compute_%d%d" % supported_cc[-1] + ) -@skip_on_cudasim('NVVM Driver unsupported in the simulator') +@skip_on_cudasim("NVVM Driver unsupported in the simulator") class TestLibDevice(unittest.TestCase): def test_libdevice_load(self): # Test that constructing LibDevice gives a bitcode file libdevice = LibDevice() - self.assertEqual(libdevice.bc[:4], b'BC\xc0\xde') + self.assertEqual(libdevice.bc[:4], b"BC\xc0\xde") -nvvmir_generic = '''\ +nvvmir_generic = """\ target triple="nvptx64-nvidia-cuda" target datalayout = "{data_layout}" @@ -194,8 +196,8 @@ def test_libdevice_load(self): !2 = !{{void (i32*)* @simple, !"kernel", i32 1}} @"llvm.used" = appending global [1 x i8*] [i8* bitcast (void (i32*)* @simple to i8*)], section "llvm.metadata" -''' # noqa: E501 +""" # noqa: E501 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py b/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py index ef727c5a8..8e4d811d1 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py @@ -6,7 +6,6 @@ class TestPinned(ContextResettingTestCase): - def _run_copies(self, A): A0 = np.copy(A) @@ -20,8 +19,8 @@ def _run_copies(self, A): def test_pinned(self): machine = platform.machine() - if machine.startswith('arm') or machine.startswith('aarch64'): - count = 262144 # 2MB + if machine.startswith("arm") or machine.startswith("aarch64"): + count = 262144 # 2MB else: count = 2097152 # 16MB A = np.arange(count) @@ -29,9 +28,9 @@ def test_pinned(self): self._run_copies(A) def test_unpinned(self): - A = np.arange(2 * 
1024 * 1024) # 16 MB + A = np.arange(2 * 1024 * 1024) # 16 MB self._run_copies(A) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py b/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py index 1660d4d42..a1d7a95ce 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py @@ -4,7 +4,7 @@ from numba.cuda.testing import skip_on_cudasim -@skip_on_cudasim('CUDA Profiler unsupported in the simulator') +@skip_on_cudasim("CUDA Profiler unsupported in the simulator") class TestProfiler(ContextResettingTestCase): def test_profiling(self): with cuda.profiling(): @@ -16,5 +16,5 @@ def test_profiling(self): del a -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py b/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py index b03fd3647..a532f8c28 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py @@ -2,8 +2,11 @@ import logging import traceback from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_with_cuda_python, - skip_under_cuda_memcheck) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_with_cuda_python, + skip_under_cuda_memcheck, +) from numba.tests.support import linux_only @@ -23,12 +26,12 @@ def child_test(): # used. 
logbuf = io.StringIO() handler = logging.StreamHandler(logbuf) - cudadrv_logger = logging.getLogger('numba.cuda.cudadrv.driver') + cudadrv_logger = logging.getLogger("numba.cuda.cudadrv.driver") cudadrv_logger.addHandler(handler) cudadrv_logger.setLevel(logging.DEBUG) # Set up data for our test, and copy over to the device - N = 2 ** 16 + N = 2**16 N_THREADS = 10 N_ADDITIONS = 4096 @@ -65,8 +68,10 @@ def kernel_thread(n): f[n_blocks, n_threads, stream](rs[n], xs[n]) # Create threads - threads = [threading.Thread(target=kernel_thread, args=(i,)) - for i in range(N_THREADS)] + threads = [ + threading.Thread(target=kernel_thread, args=(i,)) + for i in range(N_THREADS) + ] # Start all threads for thread in threads: @@ -95,7 +100,7 @@ def child_test_wrapper(result_queue): output = child_test() success = True # Catch anything raised so it can be propagated - except: # noqa: E722 + except: # noqa: E722 output = traceback.format_exc() success = False @@ -105,13 +110,13 @@ def child_test_wrapper(result_queue): # Run on Linux only until the reason for test hangs on Windows (Issue #8635, # https://github.com/numba/numba/issues/8635) is diagnosed @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Streams not supported on the simulator') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Streams not supported on the simulator") class TestPTDS(CUDATestCase): - @skip_with_cuda_python('Function names unchanged for PTDS with NV Binding') + @skip_with_cuda_python("Function names unchanged for PTDS with NV Binding") def test_ptds(self): # Run a test with PTDS enabled in a child process - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() proc = ctx.Process(target=child_test_wrapper, args=(result_queue,)) proc.start() @@ -124,8 +129,11 @@ def test_ptds(self): # Functions with a per-thread default stream variant that we expect to # see in the output - ptds_functions = ('cuMemcpyHtoD_v2_ptds', 
'cuLaunchKernel_ptsz', - 'cuMemcpyDtoH_v2_ptds') + ptds_functions = ( + "cuMemcpyHtoD_v2_ptds", + "cuLaunchKernel_ptsz", + "cuMemcpyDtoH_v2_ptds", + ) for fn in ptds_functions: with self.subTest(fn=fn, expected=True): @@ -133,17 +141,20 @@ def test_ptds(self): # Non-PTDS versions of the functions that we should not see in the # output: - legacy_functions = ('cuMemcpyHtoD_v2', 'cuLaunchKernel', - 'cuMemcpyDtoH_v2') + legacy_functions = ( + "cuMemcpyHtoD_v2", + "cuLaunchKernel", + "cuMemcpyDtoH_v2", + ) for fn in legacy_functions: with self.subTest(fn=fn, expected=False): # Ensure we only spot these function names appearing without a # _ptds or _ptsz suffix by checking including the end of the # line in the log - fn_at_end = f'{fn}\n' + fn_at_end = f"{fn}\n" self.assertNotIn(fn_at_end, output) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py index f2e0b6d10..d7a8ae384 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py @@ -7,7 +7,6 @@ class TestResetDevice(ContextResettingTestCase): def test_reset_device(self): - def newthread(exception_queue): try: devices = range(driver.get_device_count()) @@ -32,5 +31,5 @@ def newthread(exception_queue): self.assertEqual(exceptions, []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py b/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py index 51e0722ec..4cb3d09cf 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py @@ -11,32 +11,40 @@ def set_visible_devices_and_check(q): from numba import cuda import os - os.environ['CUDA_VISIBLE_DEVICES'] = '0' + os.environ["CUDA_VISIBLE_DEVICES"] = "0" q.put(len(cuda.gpus.lst)) - except: # noqa: E722 + except: 
# noqa: E722 # Sentinel value for error executing test code q.put(-1) if config.ENABLE_CUDASIM: - SUPPORTED_VERSIONS = (-1, -1), + SUPPORTED_VERSIONS = ((-1, -1),) else: - SUPPORTED_VERSIONS = ((11, 0), (11, 1), (11, 2), (11, 3), (11, 4), (11, 5), - (11, 6), (11, 7)) + SUPPORTED_VERSIONS = ( + (11, 0), + (11, 1), + (11, 2), + (11, 3), + (11, 4), + (11, 5), + (11, 6), + (11, 7), + ) class TestRuntime(unittest.TestCase): def test_is_supported_version_true(self): for v in SUPPORTED_VERSIONS: - with patch.object(runtime, 'get_version', return_value=v): + with patch.object(runtime, "get_version", return_value=v): self.assertTrue(runtime.is_supported_version()) - @skip_on_cudasim('The simulator always simulates a supported runtime') + @skip_on_cudasim("The simulator always simulates a supported runtime") def test_is_supported_version_false(self): # Check with an old unsupported version and some potential future # versions for v in ((10, 2), (11, 8), (12, 0)): - with patch.object(runtime, 'get_version', return_value=v): + with patch.object(runtime, "get_version", return_value=v): self.assertFalse(runtime.is_supported_version()) def test_supported_versions(self): @@ -57,13 +65,13 @@ def test_visible_devices_set_after_import(self): from numba import cuda if len(cuda.gpus.lst) in (0, 1): - self.skipTest('This test requires multiple GPUs') + self.skipTest("This test requires multiple GPUs") - if os.environ.get('CUDA_VISIBLE_DEVICES'): - msg = 'Cannot test when CUDA_VISIBLE_DEVICES already set' + if os.environ.get("CUDA_VISIBLE_DEVICES"): + msg = "Cannot test when CUDA_VISIBLE_DEVICES already set" self.skipTest(msg) - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") q = ctx.Queue() p = ctx.Process(target=set_visible_devices_and_check, args=(q,)) p.start() @@ -74,12 +82,12 @@ def test_visible_devices_set_after_import(self): # Make an obvious distinction between an error running the test code # and an incorrect number of GPUs in the list - 
msg = 'Error running set_visible_devices_and_check' + msg = "Error running set_visible_devices_and_check" self.assertNotEqual(visible_gpu_count, -1, msg=msg) # The actual check that we see only one GPU self.assertEqual(visible_gpu_count, 1) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py b/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py index aca78d94b..e592a4773 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py @@ -37,5 +37,5 @@ def test_select_device(self): self.assertEqual(exceptions, []) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py b/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py index c4fbec19f..e2154dda8 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py @@ -15,10 +15,11 @@ def runner(*args, **kwds): return loop.run_until_complete(f(*args, **kwds)) finally: loop.close() + return runner -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestCudaStream(CUDATestCase): def test_add_callback(self): def callback(stream, status, event): @@ -89,7 +90,7 @@ async def test_cancelled_future(self): self.assertTrue(done2.done()) -@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +@skip_on_cudasim("CUDA Driver API unsupported in the simulator") class TestFailingStream(CUDATestCase): # This test can only be run in isolation because it corrupts the CUDA # context, which cannot be recovered from within the same process. 
It is @@ -118,5 +119,5 @@ async def test_failed_stream(self): self.assertIsNotNone(done.exception()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py index ad6d9ad57..c9f7c6975 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py @@ -17,6 +17,7 @@ class UseCase: The return type is inferred from the type of the first argument, unless it is explicitly overridden by the ``retty`` kwarg. """ + def __init__(self, func, retty=None): self._func = func self._retty = retty @@ -59,6 +60,7 @@ def add_nocache_usecase_kernel(r, x, y): # Inner / outer cached / uncached cases + @cuda.jit(cache=True) def inner(x, y): return x + y + Z @@ -81,13 +83,13 @@ def outer_uncached_kernel(r, x, y): # Exercise returning a record instance. This used to hardcode the dtype # pointer's value in the bitcode. 
-packed_record_type = np.dtype([('a', np.int8), ('b', np.float64)]) -aligned_record_type = np.dtype([('a', np.int8), ('b', np.float64)], align=True) +packed_record_type = np.dtype([("a", np.int8), ("b", np.float64)]) +aligned_record_type = np.dtype([("a", np.int8), ("b", np.float64)], align=True) packed_arr = np.empty(2, dtype=packed_record_type) for i in range(packed_arr.size): - packed_arr[i]['a'] = i + 1 - packed_arr[i]['b'] = i + 42.5 + packed_arr[i]["a"] = i + 1 + packed_arr[i]["b"] = i + 42.5 aligned_arr = np.array(packed_arr, dtype=aligned_record_type) @@ -103,6 +105,7 @@ def record_return(r, ary, i): # Closure test cases + def make_closure(x): @cuda.jit(cache=True) def closure(r, y): @@ -119,6 +122,7 @@ def closure(r, y): # Ambiguous / renamed functions + @cuda.jit(cache=True) def ambiguous_function(r, x): r[()] = x[()] + 2 @@ -190,6 +194,7 @@ def many_locals(): # Simple use case for multiprocessing test + @cuda.jit(cache=True) def simple_usecase_kernel(r, x): r[()] = x[()] @@ -200,6 +205,7 @@ def simple_usecase_kernel(r, x): # Usecase with cooperative groups + @cuda.jit(cache=True) def cg_usecase_kernel(r, x): grid = cuda.cg.this_grid() diff --git a/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py index 07b42d755..a58b3c141 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py @@ -12,6 +12,7 @@ def _call(self, ret, *args): # Using the same function as a cached CPU and CUDA-jitted function + def target_shared_assign(r, x): r[()] = x[()] diff --git a/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py index 1e639d379..897b0bbd8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py @@ -23,7 +23,7 @@ def __init__(self): register_model, 
make_attribute_wrapper, typeof_impl, - type_callable + type_callable, ) from numba.cuda.cudaimpl import lower from numba.core import cgutils @@ -38,21 +38,22 @@ def __init__(self, dmm, fe_type): members = [("x", int32), ("y", int32)] super().__init__(dmm, fe_type, members) - make_attribute_wrapper(TestStructModelType, 'x', 'x') - make_attribute_wrapper(TestStructModelType, 'y', 'y') + make_attribute_wrapper(TestStructModelType, "x", "x") + make_attribute_wrapper(TestStructModelType, "y", "y") @type_callable(TestStruct) def type_test_struct(context): def typer(x, y): if isinstance(x, types.Integer) and isinstance(y, types.Integer): return test_struct_model_type + return typer @lower(TestStruct, types.Integer, types.Integer) def lower_test_type_ctor(context, builder, sig, args): - obj = cgutils.create_struct_proxy( - test_struct_model_type - )(context, builder) + obj = cgutils.create_struct_proxy(test_struct_model_type)( + context, builder + ) obj.x = args[0] obj.y = args[1] return obj._getvalue() diff --git a/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx b/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx index 8cc1aa6d6..0a818041a 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +++ b/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx @@ -26,5 +26,3 @@ st.param.b32 [func_retval0+0], %r3; ret; } - - diff --git a/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py b/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py index b182359b1..0bc1cf605 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +++ b/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py @@ -97,4 +97,5 @@ def make_list(n): return None return (n, make_list(n - 1)) + return make_list diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py b/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py index 7c7dff8ca..a3183ae47 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py @@ -6,7 +6,7 @@ 
class TestAlignment(CUDATestCase): def test_record_alignment(self): - rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')], align=True) + rec_dtype = np.dtype([("a", "int32"), ("b", "float64")], align=True) rec = from_dtype(rec_dtype) @cuda.jit((rec[:],)) @@ -24,19 +24,20 @@ def foo(a): self.assertTrue(np.all(a_recarray.a == a_recarray.b)) - @skip_on_cudasim('Simulator does not check alignment') + @skip_on_cudasim("Simulator does not check alignment") def test_record_alignment_error(self): - rec_dtype = np.dtype([('a', 'int32'), ('b', 'float64')]) + rec_dtype = np.dtype([("a", "int32"), ("b", "float64")]) rec = from_dtype(rec_dtype) with self.assertRaises(Exception) as raises: + @cuda.jit((rec[:],)) def foo(a): i = cuda.grid(1) a[i].a = a[i].b - self.assertTrue('type float64 is not aligned' in str(raises.exception)) + self.assertTrue("type float64 is not aligned" in str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array.py b/numba_cuda/numba/cuda/tests/cudapy/test_array.py index fdd759e76..a244b762f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array.py @@ -8,8 +8,11 @@ if config.ENABLE_CUDASIM: ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.pinned_array_like) else: - ARRAY_LIKE_FUNCTIONS = (cuda.device_array_like, cuda.mapped_array_like, - cuda.pinned_array_like) + ARRAY_LIKE_FUNCTIONS = ( + cuda.device_array_like, + cuda.mapped_array_like, + cuda.pinned_array_like, + ) def array_reshape1d(arr, newshape, got): @@ -55,8 +58,7 @@ def test_null_shape(self): self.assertEqual(shape2, null_shape) def test_gpu_array_strided(self): - - @cuda.jit('void(double[:])') + @cuda.jit("void(double[:])") def kernel(x): i = cuda.grid(1) if i < x.shape[0]: @@ -69,8 +71,7 @@ def kernel(x): self.assertTrue(np.allclose(z, list(range(9)))) def test_gpu_array_interleaved(self): - - @cuda.jit('void(double[:], double[:])') 
+ @cuda.jit("void(double[:], double[:])") def copykernel(x, y): i = cuda.grid(1) if i < x.shape[0]: @@ -86,8 +87,10 @@ def copykernel(x, y): except ValueError: pass else: - raise AssertionError("Should raise exception complaining the " - "contiguous-ness of the array.") + raise AssertionError( + "Should raise exception complaining the " + "contiguous-ness of the array." + ) # Should we handle this use case? # assert z.size == y.size # copykernel[1, n](y, x) @@ -108,55 +111,57 @@ def _test_array_like_same(self, like_func, array): self.assertEqual(array.shape, array_like.shape) self.assertEqual(array.strides, array_like.strides) self.assertEqual(array.dtype, array_like.dtype) - self.assertEqual(array.flags['C_CONTIGUOUS'], - array_like.flags['C_CONTIGUOUS']) - self.assertEqual(array.flags['F_CONTIGUOUS'], - array_like.flags['F_CONTIGUOUS']) + self.assertEqual( + array.flags["C_CONTIGUOUS"], array_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + array.flags["F_CONTIGUOUS"], array_like.flags["F_CONTIGUOUS"] + ) def test_array_like_1d(self): - d_a = cuda.device_array(10, order='C') + d_a = cuda.device_array(10, order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_transpose(self): - d_a = cuda.device_array((10, 12), order='C') + d_a = cuda.device_array((10, 12), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_3d(self): - d_a = cuda.device_array((10, 12, 14), order='C') + d_a = cuda.device_array((10, 12, 14), order="C") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): 
self._test_array_like_same(like_func, d_a) def test_array_like_1d_f(self): - d_a = cuda.device_array(10, order='F') + d_a = cuda.device_array(10, order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_f(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_2d_f_transpose(self): - d_a = cuda.device_array((10, 12), order='F') + d_a = cuda.device_array((10, 12), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) def test_array_like_3d_f(self): - d_a = cuda.device_array((10, 12, 14), order='F') + d_a = cuda.device_array((10, 12, 14), order="F") for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_same(like_func, d_a) @@ -173,10 +178,12 @@ def _test_array_like_view(self, like_func, view, d_view): # Use NumPy as a reference for the expected strides np_like = np.zeros_like(view) self.assertEqual(nb_like.strides, np_like.strides) - self.assertEqual(nb_like.flags['C_CONTIGUOUS'], - np_like.flags['C_CONTIGUOUS']) - self.assertEqual(nb_like.flags['F_CONTIGUOUS'], - np_like.flags['F_CONTIGUOUS']) + self.assertEqual( + nb_like.flags["C_CONTIGUOUS"], np_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + nb_like.flags["F_CONTIGUOUS"], np_like.flags["F_CONTIGUOUS"] + ) def test_array_like_1d_view(self): shape = 10 @@ -188,8 +195,8 @@ def test_array_like_1d_view(self): def test_array_like_1d_view_f(self): shape = 10 - view = np.zeros(shape, order='F')[::2] - d_view = cuda.device_array(shape, order='F')[::2] + view = np.zeros(shape, order="F")[::2] + d_view = cuda.device_array(shape, order="F")[::2] for like_func in ARRAY_LIKE_FUNCTIONS: with 
self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) @@ -204,13 +211,13 @@ def test_array_like_2d_view(self): def test_array_like_2d_view_f(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2] - d_view = cuda.device_array(shape, order='F')[::2, ::2] + view = np.zeros(shape, order="F")[::2, ::2] + d_view = cuda.device_array(shape, order="F")[::2, ::2] for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) - @skip_on_cudasim('Numba and NumPy stride semantics differ for transpose') + @skip_on_cudasim("Numba and NumPy stride semantics differ for transpose") def test_array_like_2d_view_transpose_device(self): shape = (10, 12) d_view = cuda.device_array(shape)[::2, ::2].T @@ -224,11 +231,12 @@ def test_array_like_2d_view_transpose_device(self): self.assertEqual(d_view.shape, like.shape) self.assertEqual(d_view.dtype, like.dtype) self.assertEqual((40, 8), like.strides) - self.assertTrue(like.flags['C_CONTIGUOUS']) - self.assertFalse(like.flags['F_CONTIGUOUS']) + self.assertTrue(like.flags["C_CONTIGUOUS"]) + self.assertFalse(like.flags["F_CONTIGUOUS"]) - @skip_unless_cudasim('Numba and NumPy stride semantics differ for ' - 'transpose') + @skip_unless_cudasim( + "Numba and NumPy stride semantics differ for transpose" + ) def test_array_like_2d_view_transpose_simulator(self): shape = (10, 12) view = np.zeros(shape)[::2, ::2].T @@ -243,20 +251,22 @@ def test_array_like_2d_view_transpose_simulator(self): self.assertEqual(d_view.shape, nb_like.shape) self.assertEqual(d_view.dtype, nb_like.dtype) self.assertEqual(np_like.strides, nb_like.strides) - self.assertEqual(np_like.flags['C_CONTIGUOUS'], - nb_like.flags['C_CONTIGUOUS']) - self.assertEqual(np_like.flags['F_CONTIGUOUS'], - nb_like.flags['F_CONTIGUOUS']) + self.assertEqual( + np_like.flags["C_CONTIGUOUS"], nb_like.flags["C_CONTIGUOUS"] + ) + self.assertEqual( + np_like.flags["F_CONTIGUOUS"], 
nb_like.flags["F_CONTIGUOUS"] + ) def test_array_like_2d_view_f_transpose(self): shape = (10, 12) - view = np.zeros(shape, order='F')[::2, ::2].T - d_view = cuda.device_array(shape, order='F')[::2, ::2].T + view = np.zeros(shape, order="F")[::2, ::2].T + d_view = cuda.device_array(shape, order="F")[::2, ::2].T for like_func in ARRAY_LIKE_FUNCTIONS: with self.subTest(like_func=like_func): self._test_array_like_view(like_func, view, d_view) - @skip_on_cudasim('Kernel overloads not created in the simulator') + @skip_on_cudasim("Kernel overloads not created in the simulator") def test_issue_4628(self): # CUDA Device arrays were reported as always being typed with 'A' order # so launching the kernel with a host array and then a device array @@ -318,7 +328,7 @@ def check_empty(arr): check(array_reshape, array_reshape3d, arr, (8, 1, 3)) # Test negative shape value - arr = np.arange(25).reshape(5,5) + arr = np.arange(25).reshape(5, 5) check(array_reshape, array_reshape1d, arr, -1) check(array_reshape, array_reshape1d, arr, (-1,)) check(array_reshape, array_reshape2d, arr, (-1, 5)) @@ -329,5 +339,5 @@ def check_empty(arr): check_empty(arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py index 87db4a6c7..1e3b1d920 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py @@ -7,12 +7,11 @@ class TestCudaArrayArg(CUDATestCase): def test_array_ary(self): - - @cuda.jit('double(double[:],int64)', device=True, inline=True) + @cuda.jit("double(double[:],int64)", device=True, inline=True) def device_function(a, c): return a[c] - @cuda.jit('void(double[:],double[:])') + @cuda.jit("void(double[:],double[:])") def kernel(x, y): i = cuda.grid(1) y[i] = device_function(x, i) @@ -63,7 +62,7 @@ def f(r, x): r[0] = x.x r[1] = x.y - Point = namedtuple('Point', ('x', 'y')) + Point 
= namedtuple("Point", ("x", "y")) x = Point(1, 2) r = np.zeros(len(x), dtype=np.int64) f[1, 1](r, x) @@ -78,7 +77,7 @@ def f(r1, r2, x): r1[1] = x.y r2[0] = x.r - Point = namedtuple('Point', ('x', 'y', 'r')) + Point = namedtuple("Point", ("x", "y", "r")) x = Point(1, 2, 2.236) r1 = np.zeros(2, dtype=np.int64) r2 = np.zeros(1, dtype=np.float64) @@ -197,5 +196,5 @@ def f(r, x): self.assertEqual(r[4], 3) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py index 7f129b5df..ceb884700 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py @@ -31,5 +31,5 @@ def test_reinterpret_array_type(self): self.assertEqual(expect, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py index 86dbb22c1..e4f057ba0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py @@ -22,9 +22,17 @@ def atomic_cast_none(num): @cuda.jit(device=True) -def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements, - binop_func, cast_func, initializer, - neg_idx): +def atomic_binary_1dim_shared( + ary, + idx, + op2, + ary_dtype, + ary_nelements, + binop_func, + cast_func, + initializer, + neg_idx, +): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) sm[tid] = initializer @@ -38,8 +46,9 @@ def atomic_binary_1dim_shared(ary, idx, op2, ary_dtype, ary_nelements, @cuda.jit(device=True) -def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements, - binop_func, cast_func): +def atomic_binary_1dim_shared2( + ary, idx, op2, ary_dtype, ary_nelements, binop_func, cast_func +): tid = cuda.threadIdx.x sm = cuda.shared.array(ary_nelements, ary_dtype) 
sm[tid] = ary[tid] @@ -51,8 +60,9 @@ def atomic_binary_1dim_shared2(ary, idx, op2, ary_dtype, ary_nelements, @cuda.jit(device=True) -def atomic_binary_2dim_shared(ary, op2, ary_dtype, ary_shape, - binop_func, y_cast_func, neg_idx): +def atomic_binary_2dim_shared( + ary, op2, ary_dtype, ary_shape, binop_func, y_cast_func, neg_idx +): tx = cuda.threadIdx.x ty = cuda.threadIdx.y sm = cuda.shared.array(ary_shape, ary_dtype) @@ -77,8 +87,9 @@ def atomic_binary_2dim_global(ary, op2, binop_func, y_cast_func, neg_idx): @cuda.jit(device=True) -def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, - binop_func, neg_idx): +def atomic_binary_1dim_global( + ary, idx, ary_nelements, op2, binop_func, neg_idx +): tid = cuda.threadIdx.x bin = int(idx[tid] % ary_nelements) if neg_idx: @@ -87,53 +98,79 @@ def atomic_binary_1dim_global(ary, idx, ary_nelements, op2, def atomic_add(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.add, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, False + ) def atomic_add_wrap(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.add, atomic_cast_none, 0, True) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.add, atomic_cast_none, 0, True + ) def atomic_add2(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add2_wrap(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add3(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def 
atomic_add_float(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.add, atomic_cast_to_int, 0.0, False) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.add, + atomic_cast_to_int, + 0.0, + False, + ) def atomic_add_float_wrap(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.add, atomic_cast_to_int, 0.0, True) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.add, + atomic_cast_to_int, + 0.0, + True, + ) def atomic_add_float_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add_float_2_wrap(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add_float_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_add_double_global(idx, ary): @@ -153,78 +190,117 @@ def atomic_add_double_global_2_wrap(ary): def atomic_add_double_global_3(ary): - atomic_binary_2dim_global(ary, 1, cuda.atomic.add, atomic_cast_to_uint64, - False) + atomic_binary_2dim_global( + ary, 1, cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_add_double(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.add, atomic_cast_none, 0.0, False) + atomic_binary_1dim_shared( + ary, + idx, + 1.0, + float64, + 32, + cuda.atomic.add, + atomic_cast_none, + 0.0, + False, + ) def atomic_add_double_wrap(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.add, atomic_cast_none, 0.0, True) + atomic_binary_1dim_shared( + ary, idx, 1.0, 
float64, 32, cuda.atomic.add, atomic_cast_none, 0.0, True + ) def atomic_add_double_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, False + ) def atomic_add_double_2_wrap(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_none, True) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_none, True + ) def atomic_add_double_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.add, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.add, atomic_cast_to_uint64, False + ) def atomic_sub(ary): - atomic_binary_1dim_shared(ary, ary, 1, uint32, 32, - cuda.atomic.sub, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, 1, uint32, 32, cuda.atomic.sub, atomic_cast_none, 0, False + ) def atomic_sub2(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub3(ary): - atomic_binary_2dim_shared(ary, 1, uint32, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1, uint32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_float(ary): - atomic_binary_1dim_shared(ary, ary, 1.0, float32, 32, - cuda.atomic.sub, atomic_cast_to_int, 0.0, False) + atomic_binary_1dim_shared( + ary, + ary, + 1.0, + float32, + 32, + cuda.atomic.sub, + atomic_cast_to_int, + 0.0, + False, + ) def atomic_sub_float_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float32, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_float_3(ary): - atomic_binary_2dim_shared(ary, 
1.0, float32, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float32, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_double(idx, ary): - atomic_binary_1dim_shared(ary, idx, 1.0, float64, 32, - cuda.atomic.sub, atomic_cast_none, 0.0, False) + atomic_binary_1dim_shared( + ary, + idx, + 1.0, + float64, + 32, + cuda.atomic.sub, + atomic_cast_none, + 0.0, + False, + ) def atomic_sub_double_2(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_double_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_sub_double_global(idx, ary): @@ -232,28 +308,33 @@ def atomic_sub_double_global(idx, ary): def atomic_sub_double_global_2(ary): - atomic_binary_2dim_global(ary, 1.0, cuda.atomic.sub, atomic_cast_none, - False) + atomic_binary_2dim_global( + ary, 1.0, cuda.atomic.sub, atomic_cast_none, False + ) def atomic_sub_double_global_3(ary): - atomic_binary_2dim_shared(ary, 1.0, float64, (4, 8), - cuda.atomic.sub, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, 1.0, float64, (4, 8), cuda.atomic.sub, atomic_cast_to_uint64, False + ) def atomic_and(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.and_, atomic_cast_none, 1, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.and_, atomic_cast_none, 1, False + ) def atomic_and2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.and_, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_none, False + ) def atomic_and3(ary, op2): - atomic_binary_2dim_shared(ary, 
op2, uint32, (4, 8), - cuda.atomic.and_, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.and_, atomic_cast_to_uint64, False + ) def atomic_and_global(idx, ary, op2): @@ -261,23 +342,27 @@ def atomic_and_global(idx, ary, op2): def atomic_and_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.and_, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.and_, atomic_cast_none, False + ) def atomic_or(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.or_, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.or_, atomic_cast_none, 0, False + ) def atomic_or2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.or_, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_none, False + ) def atomic_or3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.or_, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.or_, atomic_cast_to_uint64, False + ) def atomic_or_global(idx, ary, op2): @@ -285,23 +370,27 @@ def atomic_or_global(idx, ary, op2): def atomic_or_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.or_, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.or_, atomic_cast_none, False + ) def atomic_xor(ary, op2): - atomic_binary_1dim_shared(ary, ary, op2, uint32, 32, - cuda.atomic.xor, atomic_cast_none, 0, False) + atomic_binary_1dim_shared( + ary, ary, op2, uint32, 32, cuda.atomic.xor, atomic_cast_none, 0, False + ) def atomic_xor2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.xor, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_none, False + ) def atomic_xor3(ary, op2): - atomic_binary_2dim_shared(ary, 
op2, uint32, (4, 8), - cuda.atomic.xor, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.xor, atomic_cast_to_uint64, False + ) def atomic_xor_global(idx, ary, op2): @@ -309,33 +398,39 @@ def atomic_xor_global(idx, ary, op2): def atomic_xor_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.xor, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.xor, atomic_cast_none, False + ) def atomic_inc32(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, - cuda.atomic.inc, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.inc, atomic_cast_none + ) def atomic_inc64(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, - cuda.atomic.inc, atomic_cast_to_int) + atomic_binary_1dim_shared2( + ary, idx, op2, uint64, 32, cuda.atomic.inc, atomic_cast_to_int + ) def atomic_inc2_32(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.inc, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_none, False + ) def atomic_inc2_64(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.inc, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint64, (4, 8), cuda.atomic.inc, atomic_cast_none, False + ) def atomic_inc3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.inc, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.inc, atomic_cast_to_uint64, False + ) def atomic_inc_global(idx, ary, op2): @@ -343,33 +438,39 @@ def atomic_inc_global(idx, ary, op2): def atomic_inc_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.inc, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.inc, atomic_cast_none, False + ) def atomic_dec32(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, 
op2, uint32, 32, - cuda.atomic.dec, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.dec, atomic_cast_none + ) def atomic_dec64(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint64, 32, - cuda.atomic.dec, atomic_cast_to_int) + atomic_binary_1dim_shared2( + ary, idx, op2, uint64, 32, cuda.atomic.dec, atomic_cast_to_int + ) def atomic_dec2_32(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.dec, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_none, False + ) def atomic_dec2_64(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.dec, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint64, (4, 8), cuda.atomic.dec, atomic_cast_none, False + ) def atomic_dec3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.dec, atomic_cast_to_uint64, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.dec, atomic_cast_to_uint64, False + ) def atomic_dec_global(idx, ary, op2): @@ -377,23 +478,27 @@ def atomic_dec_global(idx, ary, op2): def atomic_dec_global_2(ary, op2): - atomic_binary_2dim_global(ary, op2, cuda.atomic.dec, - atomic_cast_none, False) + atomic_binary_2dim_global( + ary, op2, cuda.atomic.dec, atomic_cast_none, False + ) def atomic_exch(ary, idx, op2): - atomic_binary_1dim_shared2(ary, idx, op2, uint32, 32, - cuda.atomic.exch, atomic_cast_none) + atomic_binary_1dim_shared2( + ary, idx, op2, uint32, 32, cuda.atomic.exch, atomic_cast_none + ) def atomic_exch2(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint32, (4, 8), - cuda.atomic.exch, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, uint32, (4, 8), cuda.atomic.exch, atomic_cast_none, False + ) def atomic_exch3(ary, op2): - atomic_binary_2dim_shared(ary, op2, uint64, (4, 8), - cuda.atomic.exch, atomic_cast_none, False) + atomic_binary_2dim_shared( + ary, op2, 
uint64, (4, 8), cuda.atomic.exch, atomic_cast_none, False + ) def atomic_exch_global(idx, ary, op2): @@ -401,7 +506,6 @@ def atomic_exch_global(idx, ary, op2): def gen_atomic_extreme_funcs(func): - fns = dedent(""" def atomic(res, ary): tx = cuda.threadIdx.x @@ -431,21 +535,39 @@ def atomic_double_shared(res, ary): res[0] = smres[0] """).format(func=func) ld = {} - exec(fns, {'cuda': cuda, 'float64': float64, 'uint64': uint64}, ld) - return (ld['atomic'], ld['atomic_double_normalizedindex'], - ld['atomic_double_oneindex'], ld['atomic_double_shared']) - - -(atomic_max, atomic_max_double_normalizedindex, atomic_max_double_oneindex, - atomic_max_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.max') -(atomic_min, atomic_min_double_normalizedindex, atomic_min_double_oneindex, - atomic_min_double_shared) = gen_atomic_extreme_funcs('cuda.atomic.min') -(atomic_nanmax, atomic_nanmax_double_normalizedindex, - atomic_nanmax_double_oneindex, atomic_nanmax_double_shared) = \ - gen_atomic_extreme_funcs('cuda.atomic.nanmax') -(atomic_nanmin, atomic_nanmin_double_normalizedindex, - atomic_nanmin_double_oneindex, atomic_nanmin_double_shared) = \ - gen_atomic_extreme_funcs('cuda.atomic.nanmin') + exec(fns, {"cuda": cuda, "float64": float64, "uint64": uint64}, ld) + return ( + ld["atomic"], + ld["atomic_double_normalizedindex"], + ld["atomic_double_oneindex"], + ld["atomic_double_shared"], + ) + + +( + atomic_max, + atomic_max_double_normalizedindex, + atomic_max_double_oneindex, + atomic_max_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.max") +( + atomic_min, + atomic_min_double_normalizedindex, + atomic_min_double_oneindex, + atomic_min_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.min") +( + atomic_nanmax, + atomic_nanmax_double_normalizedindex, + atomic_nanmax_double_oneindex, + atomic_nanmax_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.nanmax") +( + atomic_nanmin, + atomic_nanmin_double_normalizedindex, + atomic_nanmin_double_oneindex, 
+ atomic_nanmin_double_shared, +) = gen_atomic_extreme_funcs("cuda.atomic.nanmin") def atomic_compare_and_swap(res, old, ary, fill_val): @@ -476,10 +598,10 @@ def test_atomic_add(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add = cuda.jit('void(uint32[:])')(atomic_add) + cuda_atomic_add = cuda.jit("void(uint32[:])")(atomic_add) cuda_atomic_add[1, 32](ary) - cuda_atomic_add_wrap = cuda.jit('void(uint32[:])')(atomic_add_wrap) + cuda_atomic_add_wrap = cuda.jit("void(uint32[:])")(atomic_add_wrap) cuda_atomic_add_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -494,10 +616,10 @@ def test_atomic_add2(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add2 = cuda.jit('void(uint32[:,:])')(atomic_add2) + cuda_atomic_add2 = cuda.jit("void(uint32[:,:])")(atomic_add2) cuda_atomic_add2[1, (4, 8)](ary) - cuda_atomic_add2_wrap = cuda.jit('void(uint32[:,:])')(atomic_add2_wrap) + cuda_atomic_add2_wrap = cuda.jit("void(uint32[:,:])")(atomic_add2_wrap) cuda_atomic_add2_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) @@ -506,7 +628,7 @@ def test_atomic_add2(self): def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3) + cuda_atomic_add3 = cuda.jit("void(uint32[:,:])")(atomic_add3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) @@ -516,10 +638,10 @@ def test_atomic_add_float(self): ary_wrap = ary.copy() orig = ary.copy().astype(np.intp) - cuda_atomic_add_float = cuda.jit('void(float32[:])')(atomic_add_float) + cuda_atomic_add_float = cuda.jit("void(float32[:])")(atomic_add_float) cuda_atomic_add_float[1, 32](ary) - add_float_wrap = cuda.jit('void(float32[:])')(atomic_add_float_wrap) + add_float_wrap = cuda.jit("void(float32[:])")(atomic_add_float_wrap) add_float_wrap[1, 32](ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -534,10 +656,10 @@ def test_atomic_add_float_2(self): 
ary_wrap = ary.copy() orig = ary.copy() - cuda_atomic_add2 = cuda.jit('void(float32[:,:])')(atomic_add_float_2) + cuda_atomic_add2 = cuda.jit("void(float32[:,:])")(atomic_add_float_2) cuda_atomic_add2[1, (4, 8)](ary) - cuda_func_wrap = cuda.jit('void(float32[:,:])')(atomic_add_float_2_wrap) + cuda_func_wrap = cuda.jit("void(float32[:,:])")(atomic_add_float_2_wrap) cuda_func_wrap[1, (4, 8)](ary_wrap) self.assertTrue(np.all(ary == orig + 1)) @@ -546,7 +668,7 @@ def test_atomic_add_float_2(self): def test_atomic_add_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_add3 = cuda.jit('void(float32[:,:])')(atomic_add_float_3) + cuda_atomic_add3 = cuda.jit("void(float32[:,:])")(atomic_add_float_3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1)) @@ -561,24 +683,24 @@ def assertCorrectFloat64Atomics(self, kernel, shared=True): inst = "(red|atom)" if shared: - inst = f'{inst}\\.shared' + inst = f"{inst}\\.shared" - self.assertRegex(asm, f'{inst}.add.f64', asm) + self.assertRegex(asm, f"{inst}.add.f64", asm) else: if shared: - self.assertIn('atom.shared.cas.b64', asm) + self.assertIn("atom.shared.cas.b64", asm) else: - self.assertIn('atom.cas.b64', asm) + self.assertIn("atom.cas.b64", asm) def test_atomic_add_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) ary_wrap = ary.copy() - cuda_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double) + cuda_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double) cuda_fn[1, 32](idx, ary) - wrap_fn = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_wrap) + wrap_fn = cuda.jit("void(int64[:], float64[:])")(atomic_add_double_wrap) wrap_fn[1, 32](idx, ary_wrap) gold = np.zeros(32, dtype=np.uint32) @@ -595,10 +717,10 @@ def test_atomic_add_double_2(self): ary_wrap = ary.copy() orig = ary.copy() - cuda_fn = cuda.jit('void(float64[:,:])')(atomic_add_double_2) + cuda_fn = 
cuda.jit("void(float64[:,:])")(atomic_add_double_2) cuda_fn[1, (4, 8)](ary) - cuda_fn_wrap = cuda.jit('void(float64[:,:])')(atomic_add_double_2_wrap) + cuda_fn_wrap = cuda.jit("void(float64[:,:])")(atomic_add_double_2_wrap) cuda_fn_wrap[1, (4, 8)](ary_wrap) np.testing.assert_equal(ary, orig + 1) @@ -609,7 +731,7 @@ def test_atomic_add_double_2(self): def test_atomic_add_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) @@ -620,7 +742,7 @@ def test_atomic_add_double_global(self): ary = np.zeros(32, np.float64) ary_wrap = ary.copy() - sig = 'void(int64[:], float64[:])' + sig = "void(int64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_add_double_global) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_wrap) @@ -641,7 +763,7 @@ def test_atomic_add_double_global_2(self): ary_wrap = ary.copy() orig = ary.copy() - sig = 'void(float64[:,:])' + sig = "void(float64[:,:])" cuda_func = cuda.jit(sig)(atomic_add_double_global_2) wrap_cuda_func = cuda.jit(sig)(atomic_add_double_global_2_wrap) @@ -656,7 +778,7 @@ def test_atomic_add_double_global_2(self): def test_atomic_add_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_add_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1) @@ -665,7 +787,7 @@ def test_atomic_add_double_global_3(self): def test_atomic_sub(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - cuda_atomic_sub = cuda.jit('void(uint32[:])')(atomic_sub) + cuda_atomic_sub = cuda.jit("void(uint32[:])")(atomic_sub) cuda_atomic_sub[1, 32](ary) gold = np.zeros(32, 
dtype=np.uint32) @@ -677,21 +799,21 @@ def test_atomic_sub(self): def test_atomic_sub2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub2 = cuda.jit('void(uint32[:,:])')(atomic_sub2) + cuda_atomic_sub2 = cuda.jit("void(uint32[:,:])")(atomic_sub2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub3 = cuda.jit('void(uint32[:,:])')(atomic_sub3) + cuda_atomic_sub3 = cuda.jit("void(uint32[:,:])")(atomic_sub3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float(self): ary = np.random.randint(0, 32, size=32).astype(np.float32) orig = ary.copy().astype(np.intp) - cuda_atomic_sub_float = cuda.jit('void(float32[:])')(atomic_sub_float) + cuda_atomic_sub_float = cuda.jit("void(float32[:])")(atomic_sub_float) cuda_atomic_sub_float[1, 32](ary) gold = np.zeros(32, dtype=np.float32) @@ -703,21 +825,21 @@ def test_atomic_sub_float(self): def test_atomic_sub_float_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub2 = cuda.jit('void(float32[:,:])')(atomic_sub_float_2) + cuda_atomic_sub2 = cuda.jit("void(float32[:,:])")(atomic_sub_float_2) cuda_atomic_sub2[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_float_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float32).reshape(4, 8) orig = ary.copy() - cuda_atomic_sub3 = cuda.jit('void(float32[:,:])')(atomic_sub_float_3) + cuda_atomic_sub3 = cuda.jit("void(float32[:,:])")(atomic_sub_float_3) cuda_atomic_sub3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig - 1)) def test_atomic_sub_double(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) - cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_sub_double) + cuda_func = 
cuda.jit("void(int64[:], float64[:])")(atomic_sub_double) cuda_func[1, 32](idx, ary) gold = np.zeros(32, dtype=np.float64) @@ -729,21 +851,21 @@ def test_atomic_sub_double(self): def test_atomic_sub_double_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_2) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) - sig = 'void(int64[:], float64[:])' + sig = "void(int64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_sub_double_global) cuda_func[1, 32](idx, ary) @@ -756,14 +878,14 @@ def test_atomic_sub_double_global(self): def test_atomic_sub_double_global_2(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_2) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_2) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) def test_atomic_sub_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(float64[:,:])')(atomic_sub_double_global_3) + cuda_func = cuda.jit("void(float64[:,:])")(atomic_sub_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig - 1) @@ -771,7 +893,7 @@ def test_atomic_and(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, 
size=32).astype(np.uint32) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_and) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_and) cuda_func[1, 32](ary, rand_const) gold = ary.copy() @@ -784,7 +906,7 @@ def test_atomic_and2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and2) + cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) @@ -792,7 +914,7 @@ def test_atomic_and3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_and3) + cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_and3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig & rand_const)) @@ -800,7 +922,7 @@ def test_atomic_and_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_and_global) cuda_func[1, 32](idx, ary, rand_const) @@ -814,7 +936,7 @@ def test_atomic_and_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_and_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_and_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig & rand_const) @@ -822,7 +944,7 @@ def test_atomic_or(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - 
cuda_func = cuda.jit('void(uint32[:], uint32)')(atomic_or) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_or) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) @@ -835,7 +957,7 @@ def test_atomic_or2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or2) + cuda_atomic_and2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or2) cuda_atomic_and2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) @@ -843,7 +965,7 @@ def test_atomic_or3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_and3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_or3) + cuda_atomic_and3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_or3) cuda_atomic_and3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig | rand_const)) @@ -851,7 +973,7 @@ def test_atomic_or_global(self): rand_const = np.random.randint(500) idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_or_global) cuda_func[1, 32](idx, ary, rand_const) @@ -865,7 +987,7 @@ def test_atomic_or_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_or_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_or_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig | rand_const) @@ -873,7 +995,7 @@ def test_atomic_xor(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:], 
uint32)')(atomic_xor) + cuda_func = cuda.jit("void(uint32[:], uint32)")(atomic_xor) cuda_func[1, 32](ary, rand_const) gold = np.zeros(32, dtype=np.uint32) @@ -886,7 +1008,7 @@ def test_atomic_xor2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_xor2 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor2) + cuda_atomic_xor2 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor2) cuda_atomic_xor2[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) @@ -894,7 +1016,7 @@ def test_atomic_xor3(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_atomic_xor3 = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor3) + cuda_atomic_xor3 = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor3) cuda_atomic_xor3[1, (4, 8)](ary, rand_const) self.assertTrue(np.all(ary == orig ^ rand_const)) @@ -903,7 +1025,7 @@ def test_atomic_xor_global(self): idx = np.random.randint(0, 32, size=32, dtype=np.int32) ary = np.random.randint(0, 32, size=32, dtype=np.int32) gold = ary.copy() - sig = 'void(int32[:], int32[:], int32)' + sig = "void(int32[:], int32[:], int32)" cuda_func = cuda.jit(sig)(atomic_xor_global) cuda_func[1, 32](idx, ary, rand_const) @@ -916,12 +1038,12 @@ def test_atomic_xor_global_2(self): rand_const = np.random.randint(500) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_xor_global_2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_xor_global_2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, orig ^ rand_const) def inc_dec_1dim_setup(self, dtype): - rconst = np.random.randint(32, dtype=dtype) + rconst = np.random.randint(32, dtype=dtype) rary = np.random.randint(0, 32, size=32).astype(dtype) ary_idx = np.arange(32, dtype=dtype) return rconst, rary, 
ary_idx @@ -951,131 +1073,141 @@ def check_inc(self, ary, rconst, sig, nblocks, blksize, func): def test_atomic_inc_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc32) def test_atomic_inc_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' + sig = "void(uint64[:], uint64[:], uint64)" self.check_inc_index(ary, idx, rand_const, sig, 1, 32, atomic_inc64) def test_atomic_inc2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_32) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_32) def test_atomic_inc2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc2_64) + sig = "void(uint64[:,:], uint64)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc2_64) def test_atomic_inc3(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc3) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc3) def test_atomic_inc_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' - self.check_inc_index2(ary, idx, rand_const, sig, 1, 32, - atomic_inc_global) + sig = "void(uint32[:], uint32[:], uint32)" + self.check_inc_index2( + ary, idx, rand_const, sig, 1, 32, atomic_inc_global + ) def test_atomic_inc_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' - self.check_inc_index2(ary, idx, rand_const, 
sig, 1, 32, - atomic_inc_global) + sig = "void(uint64[:], uint64[:], uint64)" + self.check_inc_index2( + ary, idx, rand_const, sig, 1, 32, atomic_inc_global + ) def test_atomic_inc_global_2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) + sig = "void(uint32[:,:], uint32)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2) def test_atomic_inc_global_2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_inc(ary, rand_const, sig, 1, (4,8), atomic_inc_global_2) + sig = "void(uint64[:,:], uint64)" + self.check_inc(ary, rand_const, sig, 1, (4, 8), atomic_inc_global_2) def check_dec_index(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, idx, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def check_dec_index2(self, ary, idx, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](idx, ary, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def check_dec(self, ary, rconst, sig, nblocks, blksize, func): orig = ary.copy() cuda_func = cuda.jit(sig)(func) cuda_func[nblocks, blksize](ary, rconst) - np.testing.assert_equal(ary, np.where(orig == 0, rconst, - np.where(orig > rconst, - rconst, - orig - 1))) + np.testing.assert_equal( + ary, + np.where( + orig == 0, rconst, np.where(orig > rconst, rconst, orig - 1) + ), + ) def test_atomic_dec_32(self): rand_const, ary, idx = 
self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec32) def test_atomic_dec_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' + sig = "void(uint64[:], uint64[:], uint64)" self.check_dec_index(ary, idx, rand_const, sig, 1, 32, atomic_dec64) def test_atomic_dec2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_32) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_32) def test_atomic_dec2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec2_64) + sig = "void(uint64[:,:], uint64)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec2_64) def test_atomic_dec3_new(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec3) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec3) def test_atomic_dec_global_32(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint32) - sig = 'void(uint32[:], uint32[:], uint32)' - self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, - atomic_dec_global) + sig = "void(uint32[:], uint32[:], uint32)" + self.check_dec_index2( + ary, idx, rand_const, sig, 1, 32, atomic_dec_global + ) def test_atomic_dec_global_64(self): rand_const, ary, idx = self.inc_dec_1dim_setup(dtype=np.uint64) - sig = 'void(uint64[:], uint64[:], uint64)' - self.check_dec_index2(ary, idx, rand_const, sig, 1, 32, - atomic_dec_global) + sig = "void(uint64[:], uint64[:], uint64)" + self.check_dec_index2( + ary, idx, rand_const, sig, 1, 32, 
atomic_dec_global + ) def test_atomic_dec_global2_32(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint32) - sig = 'void(uint32[:,:], uint32)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) + sig = "void(uint32[:,:], uint32)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2) def test_atomic_dec_global2_64(self): rand_const, ary = self.inc_dec_2dim_setup(np.uint64) - sig = 'void(uint64[:,:], uint64)' - self.check_dec(ary, rand_const, sig, 1, (4,8), atomic_dec_global_2) + sig = "void(uint64[:,:], uint64)" + self.check_dec(ary, rand_const, sig, 1, (4, 8), atomic_dec_global_2) def test_atomic_exch(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32) idx = np.arange(32, dtype=np.uint32) - cuda_func = cuda.jit('void(uint32[:], uint32[:], uint32)')(atomic_exch) + cuda_func = cuda.jit("void(uint32[:], uint32[:], uint32)")(atomic_exch) cuda_func[1, 32](ary, idx, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1084,7 +1216,7 @@ def test_atomic_exch2(self): rand_const = np.random.randint(50, 100, dtype=np.uint32) ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) - cuda_func = cuda.jit('void(uint32[:,:], uint32)')(atomic_exch2) + cuda_func = cuda.jit("void(uint32[:,:], uint32)")(atomic_exch2) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1092,7 +1224,7 @@ def test_atomic_exch3(self): rand_const = np.random.randint(50, 100, dtype=np.uint64) ary = np.random.randint(0, 32, size=32).astype(np.uint64).reshape(4, 8) - cuda_func = cuda.jit('void(uint64[:,:], uint64)')(atomic_exch3) + cuda_func = cuda.jit("void(uint64[:,:], uint64)")(atomic_exch3) cuda_func[1, (4, 8)](ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1101,7 +1233,7 @@ def test_atomic_exch_global(self): idx = np.arange(32, dtype=np.uint32) ary = np.random.randint(0, 32, size=32, dtype=np.uint32) - sig = 'void(uint32[:], 
uint32[:], uint32)' + sig = "void(uint32[:], uint32[:], uint32)" cuda_func = cuda.jit(sig)(atomic_exch_global) cuda_func[1, 32](idx, ary, rand_const) np.testing.assert_equal(ary, rand_const) @@ -1135,8 +1267,9 @@ def test_atomic_max_double(self): def test_atomic_max_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:,:])')( - atomic_max_double_normalizedindex) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")( + atomic_max_double_normalizedindex + ) cuda_func[32, 32](res, vals) gold = np.max(vals) @@ -1145,8 +1278,9 @@ def test_atomic_max_double_normalizedindex(self): def test_atomic_max_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_max_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_max_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.max(vals) @@ -1182,8 +1316,9 @@ def test_atomic_min_double(self): def test_atomic_min_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.ones(1, np.float64) * 65535 - cuda_func = cuda.jit('void(float64[:], float64[:,:])')( - atomic_min_double_normalizedindex) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")( + atomic_min_double_normalizedindex + ) cuda_func[32, 32](res, vals) gold = np.min(vals) @@ -1192,8 +1327,9 @@ def test_atomic_min_double_normalizedindex(self): def test_atomic_min_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) res = np.ones(1, np.float64) * 128 - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_min_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_min_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.min(vals) @@ -1211,16 +1347,15 @@ def 
test_atomic_min_double_oneindex(self): # the result will be ary[idx] for either of ary[idx] or val being NaN. def _test_atomic_minmax_nan_location(self, func): + cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func) - cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) - - vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64) + vals = np.random.randint(0, 128, size=(1, 1)).astype(np.float64) res = np.zeros(1, np.float64) + np.nan cuda_func[1, 1](res, vals) np.testing.assert_equal(res, [np.nan]) def _test_atomic_minmax_nan_val(self, func): - cuda_func = cuda.jit('void(float64[:], float64[:,:])')(func) + cuda_func = cuda.jit("void(float64[:], float64[:,:])")(func) res = np.random.randint(0, 128, size=1).astype(np.float64) gold = res.copy() @@ -1244,7 +1379,7 @@ def test_atomic_max_nan_val(self): def test_atomic_max_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.zeros(1, np.float64) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_max_double_shared) cuda_func[1, 32](res, vals) @@ -1254,7 +1389,7 @@ def test_atomic_max_double_shared(self): def test_atomic_min_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.ones(1, np.float64) * 32 - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_min_double_shared) cuda_func[1, 32](res, vals) @@ -1289,64 +1424,120 @@ def check_cas(self, n, fill, unfill, dtype, cas_func, ndim=1): np.testing.assert_array_equal(expect_out, out) def test_atomic_compare_and_swap(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=-99, + unfill=-1, + dtype=np.int32, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap2(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_compare_and_swap) + 
self.check_cas( + n=100, + fill=-45, + unfill=-1, + dtype=np.int64, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap3(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_compare_and_swap, + ) def test_atomic_compare_and_swap4(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_compare_and_swap) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_compare_and_swap, + ) def test_atomic_cas_1dim(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, fill=-99, unfill=-1, dtype=np.int32, cas_func=atomic_cas_1dim + ) def test_atomic_cas_2dim(self): - self.check_cas(n=100, fill=-99, unfill=-1, dtype=np.int32, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=-99, + unfill=-1, + dtype=np.int32, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas2_1dim(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, fill=-45, unfill=-1, dtype=np.int64, cas_func=atomic_cas_1dim + ) def test_atomic_cas2_2dim(self): - self.check_cas(n=100, fill=-45, unfill=-1, dtype=np.int64, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=-45, + unfill=-1, + dtype=np.int64, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas3_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_cas_1dim) + 
self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_cas_1dim, + ) def test_atomic_cas3_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint32) runfill = np.random.randint(1, 25, dtype=np.uint32) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint32, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint32, + cas_func=atomic_cas_2dim, + ndim=2, + ) def test_atomic_cas4_1dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_cas_1dim) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_cas_1dim, + ) def test_atomic_cas4_2dim(self): rfill = np.random.randint(50, 500, dtype=np.uint64) runfill = np.random.randint(1, 25, dtype=np.uint64) - self.check_cas(n=100, fill=rfill, unfill=runfill, dtype=np.uint64, - cas_func=atomic_cas_2dim, ndim=2) + self.check_cas( + n=100, + fill=rfill, + unfill=runfill, + dtype=np.uint64, + cas_func=atomic_cas_2dim, + ndim=2, + ) # Tests that the atomic add, min, and max operations return the old value - # in the simulator, they did not (see Issue #5458). 
The max and min have @@ -1438,34 +1629,36 @@ def check_atomic_nanmax(self, dtype, lo, hi, init_val): np.testing.assert_equal(res, gold) def test_atomic_nanmax_int32(self): - self.check_atomic_nanmax(dtype=np.int32, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmax( + dtype=np.int32, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmax_uint32(self): - self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmax(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_int64(self): - self.check_atomic_nanmax(dtype=np.int64, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmax( + dtype=np.int64, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmax_uint64(self): - self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmax(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmax_float32(self): - self.check_atomic_nanmax(dtype=np.float32, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmax( + dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmax_double(self): - self.check_atomic_nanmax(dtype=np.float64, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmax( + dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmax_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([0], dtype=vals.dtype) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_nanmax_double_shared) cuda_func[1, 32](res, vals) @@ -1476,8 +1669,9 @@ def test_atomic_nanmax_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.zeros(1, np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_max_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + atomic_max_double_oneindex + ) 
cuda_func[1, 32](res, vals) gold = np.nanmax(vals) @@ -1495,34 +1689,36 @@ def check_atomic_nanmin(self, dtype, lo, hi, init_val): np.testing.assert_equal(res, gold) def test_atomic_nanmin_int32(self): - self.check_atomic_nanmin(dtype=np.int32, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmin( + dtype=np.int32, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmin_uint32(self): - self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmin(dtype=np.uint32, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_int64(self): - self.check_atomic_nanmin(dtype=np.int64, lo=-65535, hi=65535, - init_val=0) + self.check_atomic_nanmin( + dtype=np.int64, lo=-65535, hi=65535, init_val=0 + ) def test_atomic_nanmin_uint64(self): - self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, - init_val=0) + self.check_atomic_nanmin(dtype=np.uint64, lo=0, hi=65535, init_val=0) def test_atomic_nanmin_float(self): - self.check_atomic_nanmin(dtype=np.float32, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmin( + dtype=np.float32, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmin_double(self): - self.check_atomic_nanmin(dtype=np.float64, lo=-65535, hi=65535, - init_val=np.nan) + self.check_atomic_nanmin( + dtype=np.float64, lo=-65535, hi=65535, init_val=np.nan + ) def test_atomic_nanmin_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([32], dtype=vals.dtype) - sig = 'void(float64[:], float64[:])' + sig = "void(float64[:], float64[:])" cuda_func = cuda.jit(sig)(atomic_nanmin_double_shared) cuda_func[1, 32](res, vals) @@ -1533,8 +1729,9 @@ def test_atomic_nanmin_double_oneindex(self): vals = np.random.randint(0, 128, size=32).astype(np.float64) vals[1::2] = np.nan res = np.array([128], np.float64) - cuda_func = cuda.jit('void(float64[:], float64[:])')( - atomic_min_double_oneindex) + cuda_func = cuda.jit("void(float64[:], float64[:])")( + 
atomic_min_double_oneindex + ) cuda_func[1, 32](res, vals) gold = np.nanmin(vals) @@ -1610,5 +1807,5 @@ def kernel(x): self._test_atomic_nan_returns_old(kernel, 11) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py b/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py index 1375162d9..7cf4d288f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py @@ -17,13 +17,23 @@ def cnd(d): K = 1.0 / (1.0 + 0.2316419 * np.abs(d)) - ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - ret_val, ret_val) -def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, - Riskfree, Volatility): +def black_scholes( + callResult, + putResult, + stockPrice, + optionStrike, + optionYears, + Riskfree, + Volatility, +): S = stockPrice X = optionStrike T = optionYears @@ -35,9 +45,9 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, cndd1 = cnd(d1) cndd2 = cnd(d2) - expRT = np.exp(- R * T) - callResult[:] = (S * cndd1 - X * expRT * cndd2) - putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)) + expRT = np.exp(-R * T) + callResult[:] = S * cndd1 - X * expRT * cndd2 + putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1) def randfloat(rand_var, low, high): @@ -61,34 +71,54 @@ def test_blackscholes(self): # numpy for i in range(iterations): - black_scholes(callResultNumpy, putResultNumpy, stockPrice, - optionStrike, optionYears, RISKFREE, VOLATILITY) + black_scholes( + callResultNumpy, + putResultNumpy, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) @cuda.jit(double(double), device=True, inline=True) def cnd_cuda(d): K = 1.0 / (1.0 + 0.2316419 * 
math.fabs(d)) - ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * math.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d > 0: ret_val = 1.0 - ret_val return ret_val - @cuda.jit(void(double[:], double[:], double[:], double[:], double[:], - double, double)) + @cuda.jit( + void( + double[:], + double[:], + double[:], + double[:], + double[:], + double, + double, + ) + ) def black_scholes_cuda(callResult, putResult, S, X, T, R, V): i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) - d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) - / (V * sqrtT)) + d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / ( + V * sqrtT + ) d2 = d1 - V * sqrtT cndd1 = cnd_cuda(d1) cndd2 = cnd_cuda(d2) - expRT = math.exp((-1. * R) * T[i]) - callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) - putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) + expRT = math.exp((-1.0 * R) * T[i]) + callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 + putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) # numba blockdim = 512, 1 @@ -102,8 +132,14 @@ def black_scholes_cuda(callResult, putResult, S, X, T, R, V): for i in range(iterations): black_scholes_cuda[griddim, blockdim, stream]( - d_callResult, d_putResult, d_stockPrice, d_optionStrike, - d_optionYears, RISKFREE, VOLATILITY) + d_callResult, + d_putResult, + d_stockPrice, + d_optionStrike, + d_optionYears, + RISKFREE, + VOLATILITY, + ) d_callResult.copy_to_host(callResultNumba, stream) d_putResult.copy_to_host(putResultNumba, stream) stream.synchronize() @@ -116,5 +152,5 @@ def black_scholes_cuda(callResult, putResult, S, X, T, R, V): self.assertTrue(max_abs_err < 1e-13) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py 
b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py index fc0568233..ac0d28769 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py @@ -12,13 +12,13 @@ def boolean_func(A, vertial): class TestCudaBoolean(CUDATestCase): def test_boolean(self): - func = cuda.jit('void(float64[:], bool_)')(boolean_func) - A = np.array([0], dtype='float64') + func = cuda.jit("void(float64[:], bool_)")(boolean_func) + A = np.array([0], dtype="float64") func[1, 1](A, True) self.assertTrue(A[0] == 123) func[1, 1](A, False) self.assertTrue(A[0] == 321) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_caching.py b/numba_cuda/numba/cuda/tests/cudapy/test_caching.py index 22e2f4a6e..d8002207e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_caching.py @@ -8,15 +8,22 @@ from numba import cuda from numba.core.errors import NumbaWarning -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_60, skip_if_cudadevrt_missing, - skip_if_mvc_enabled, test_data_dir) +from numba.cuda.testing import ( + CUDATestCase, + skip_on_cudasim, + skip_unless_cc_60, + skip_if_cudadevrt_missing, + skip_if_mvc_enabled, + test_data_dir, +) from numba.tests.support import SerialMixin -from numba.tests.test_caching import (DispatcherCacheUsecasesTest, - skip_bad_access) +from numba.tests.test_caching import ( + DispatcherCacheUsecasesTest, + skip_bad_access, +) -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) usecases_file = os.path.join(here, "cache_usecases.py") @@ -72,23 +79,23 @@ def test_many_locals(self): mod = self.import_module() f = mod.many_locals f[1, 1]() - self.check_pycache(2) # 1 index, 1 data + 
self.check_pycache(2) # 1 index, 1 data def test_closure(self): mod = self.import_module() with warnings.catch_warnings(): - warnings.simplefilter('error', NumbaWarning) + warnings.simplefilter("error", NumbaWarning) f = mod.closure1 - self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6 + self.assertPreciseEqual(f(3), 6) # 3 + 3 = 6 f = mod.closure2 - self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8 + self.assertPreciseEqual(f(3), 8) # 3 + 5 = 8 f = mod.closure3 - self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10 + self.assertPreciseEqual(f(3), 10) # 3 + 7 = 10 f = mod.closure4 - self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12 - self.check_pycache(5) # 1 nbi, 4 nbc + self.assertPreciseEqual(f(3), 12) # 3 + 9 = 12 + self.check_pycache(5) # 1 nbi, 4 nbc def test_cache_reuse(self): mod = self.import_module() @@ -158,7 +165,7 @@ def test_same_names(self): @skip_unless_cc_60 @skip_if_cudadevrt_missing - @skip_if_mvc_enabled('CG not supported with MVC') + @skip_if_mvc_enabled("CG not supported with MVC") def test_cache_cg(self): # Functions using cooperative groups should be cacheable. 
See Issue # #8888: https://github.com/numba/numba/issues/8888 @@ -174,7 +181,7 @@ def test_cache_cg(self): @skip_unless_cc_60 @skip_if_cudadevrt_missing - @skip_if_mvc_enabled('CG not supported with MVC') + @skip_if_mvc_enabled("CG not supported with MVC") def test_cache_cg_clean_run(self): # See Issue #9432: https://github.com/numba/numba/issues/9432 # If a cached function using CG sync was the first thing to compile, @@ -191,9 +198,11 @@ def test_cache_cg_clean_run(self): mod.cg_usecase(0) """ % dict(tempdir=self.tempdir, modname=self.modname) - popen = subprocess.Popen([sys.executable, "-c", code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + popen = subprocess.Popen( + [sys.executable, "-c", code], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) out, err = popen.communicate(timeout=60) if popen.returncode != 0: raise AssertionError( @@ -212,8 +221,9 @@ def _test_pycache_fallback(self): f = mod.add_usecase # Remove this function's cache files at the end, to avoid accumulation # across test calls. 
- self.addCleanup(shutil.rmtree, f.func.stats.cache_path, - ignore_errors=True) + self.addCleanup( + shutil.rmtree, f.func.stats.cache_path, ignore_errors=True + ) self.assertPreciseEqual(f(2, 3), 6) # It's a cache miss since the file was copied to a new temp location @@ -230,8 +240,9 @@ def _test_pycache_fallback(self): self.check_pycache(0) @skip_bad_access - @unittest.skipIf(os.name == "nt", - "cannot easily make a directory read-only on Windows") + @unittest.skipIf( + os.name == "nt", "cannot easily make a directory read-only on Windows" + ) def test_non_creatable_pycache(self): # Make it impossible to create the __pycache__ directory old_perms = os.stat(self.tempdir).st_mode @@ -241,11 +252,12 @@ def test_non_creatable_pycache(self): self._test_pycache_fallback() @skip_bad_access - @unittest.skipIf(os.name == "nt", - "cannot easily make a directory read-only on Windows") + @unittest.skipIf( + os.name == "nt", "cannot easily make a directory read-only on Windows" + ) def test_non_writable_pycache(self): # Make it impossible to write to the __pycache__ directory - pycache = os.path.join(self.tempdir, '__pycache__') + pycache = os.path.join(self.tempdir, "__pycache__") os.mkdir(pycache) old_perms = os.stat(pycache).st_mode os.chmod(pycache, 0o500) @@ -254,15 +266,16 @@ def test_non_writable_pycache(self): self._test_pycache_fallback() def test_cannot_cache_linking_libraries(self): - link = str(test_data_dir / 'jitlink.ptx') - msg = 'Cannot pickle CUDACodeLibrary with linking files' + link = str(test_data_dir / "jitlink.ptx") + msg = "Cannot pickle CUDACodeLibrary with linking files" with self.assertRaisesRegex(RuntimeError, msg): - @cuda.jit('void()', cache=True, link=[link]) + + @cuda.jit("void()", cache=True, link=[link]) def f(): pass -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) 
usecases_file = os.path.join(here, "cache_with_cpu_usecases.py") @@ -353,7 +366,7 @@ def get_different_cc_gpus(): return None -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class TestMultiCCCaching(SerialMixin, DispatcherCacheUsecasesTest): here = os.path.dirname(__file__) usecases_file = os.path.join(here, "cache_usecases.py") @@ -370,7 +383,7 @@ def tearDown(self): def test_cache(self): gpus = get_different_cc_gpus() if not gpus: - self.skipTest('Need two different CCs for multi-CC cache test') + self.skipTest("Need two different CCs for multi-CC cache test") self.check_pycache(0) mod = self.import_module() @@ -482,13 +495,13 @@ def child_initializer(): # Disable occupancy and implicit copy warnings in processes in a # multiprocessing pool. from numba.core import config + config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 config.CUDA_WARN_ON_IMPLICIT_COPY = 0 -@skip_on_cudasim('Simulator does not implement caching') +@skip_on_cudasim("Simulator does not implement caching") class TestMultiprocessCache(SerialMixin, DispatcherCacheUsecasesTest): - # Nested multiprocessing.Pool raises AssertionError: # "daemonic processes are not allowed to have children" _numba_parallel_test_ = False @@ -513,7 +526,7 @@ def test_multiprocessing(self): f = mod.simple_usecase_caller n = 3 try: - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") except AttributeError: ctx = multiprocessing @@ -526,7 +539,7 @@ def test_multiprocessing(self): self.assertEqual(res, n * (n - 1) // 2) -@skip_on_cudasim('Simulator does not implement the CUDACodeLibrary') +@skip_on_cudasim("Simulator does not implement the CUDACodeLibrary") class TestCUDACodeLibrary(CUDATestCase): # For tests of miscellaneous CUDACodeLibrary behaviour that we wish to # explicitly check @@ -539,7 +552,7 @@ def test_cannot_serialize_unfinalized(self): # Usually a CodeLibrary requires a real CodeGen, but since we don't # interact with 
it, anything will do codegen = object() - name = 'library' + name = "library" cl = CUDACodeLibrary(codegen, name) - with self.assertRaisesRegex(RuntimeError, 'Cannot pickle unfinalized'): + with self.assertRaisesRegex(RuntimeError, "Cannot pickle unfinalized"): cl._reduce_states() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_casting.py b/numba_cuda/numba/cuda/tests/cudapy/test_casting.py index 2ce77e05b..1d291fa9f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_casting.py @@ -4,8 +4,7 @@ from numba.core.types import f2, i1, i2, i4, i8, u1, u2, u4, u8 from numba import cuda from numba.core import types -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_53) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_53 from numba.types import float16, float32 import itertools import unittest @@ -50,7 +49,7 @@ def to_uint64(x): def to_float16(x): # When division and operators on float16 types are supported, this should # be changed to match the implementation in to_float32. - return (np.float16(x) * np.float16(0.5)) + return np.float16(x) * np.float16(0.5) def to_float32(x): @@ -76,6 +75,7 @@ def to_complex128(x): # - The device version uses cuda.fp16.hmul # - The host version uses the * operator + def cuda_int_literal_to_float16(x): # Note that we need to use `2` and not `np.float16(2)` to ensure that this # types as a literal int and not a const float16. 
@@ -128,7 +128,7 @@ def test_float_to_int(self): self.assertEqual(cfunc(-12.3), pyfunc(-12.3)) self.assertEqual(cfunc(-12.3), int(-12.3)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_int_ptx(self): pyfuncs = (to_int8, to_int16, to_int32, to_int64) sizes = (8, 16, 32, 64) @@ -150,7 +150,7 @@ def test_float_to_uint(self): self.assertEqual(cfunc(12.3), pyfunc(12.3)) self.assertEqual(cfunc(12.3), int(12.3)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_uint_ptx(self): pyfuncs = (to_uint8, to_uint16, to_uint32, to_uint64) sizes = (8, 16, 32, 64) @@ -171,17 +171,18 @@ def test_int_to_float(self): @skip_unless_cc_53 def test_literal_to_float16(self): - cudafuncs = (cuda_int_literal_to_float16, - cuda_float_literal_to_float16) - hostfuncs = (reference_int_literal_to_float16, - reference_float_literal_to_float16) + cudafuncs = (cuda_int_literal_to_float16, cuda_float_literal_to_float16) + hostfuncs = ( + reference_int_literal_to_float16, + reference_float_literal_to_float16, + ) for cudafunc, hostfunc in zip(cudafuncs, hostfuncs): with self.subTest(func=cudafunc): cfunc = self._create_wrapped(cudafunc, np.float16, np.float16) self.assertEqual(cfunc(321), hostfunc(321)) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_int_to_float16_ptx(self): fromtys = (i1, i2, i4, i8) sizes = (8, 16, 32, 64) @@ -190,7 +191,7 @@ def test_int_to_float16_ptx(self): ptx, _ = compile_ptx(to_float16, (ty,), device=True) self.assertIn(f"cvt.rn.f16.s{size}", ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_uint_to_float16_ptx(self): fromtys = (u1, u2, u4, u8) sizes = (8, 16, 32, 64) @@ -211,12 +212,14 @@ def 
test_float_to_float(self): # the CUDA target doesn't yet implement division (or operators) # for float16 values, so we test by comparing with the computed # expression instead. - np.testing.assert_allclose(cfunc(12.3), - toty(12.3) / toty(2), rtol=0.0003) - np.testing.assert_allclose(cfunc(-12.3), - toty(-12.3) / toty(2), rtol=0.0003) - - @skip_on_cudasim('Compilation unsupported in the simulator') + np.testing.assert_allclose( + cfunc(12.3), toty(12.3) / toty(2), rtol=0.0003 + ) + np.testing.assert_allclose( + cfunc(-12.3), toty(-12.3) / toty(2), rtol=0.0003 + ) + + @skip_on_cudasim("Compilation unsupported in the simulator") def test_float16_to_float_ptx(self): pyfuncs = (to_float32, to_float64) postfixes = ("f32", "f64") @@ -239,12 +242,14 @@ def test_float_to_complex(self): # to match the casting that is automatically applied when # passing the input to the cfunc as part of wrapping it in # an array of type fromtype. - np.testing.assert_allclose(cfunc(3.21), - pyfunc(fromty(3.21))) - np.testing.assert_allclose(cfunc(-3.21), - pyfunc(fromty(-3.21)) + 0j) - - @skip_on_cudasim('Compilation unsupported in the simulator') + np.testing.assert_allclose( + cfunc(3.21), pyfunc(fromty(3.21)) + ) + np.testing.assert_allclose( + cfunc(-3.21), pyfunc(fromty(-3.21)) + 0j + ) + + @skip_on_cudasim("Compilation unsupported in the simulator") def test_native_cast(self): float32_ptx, _ = cuda.compile_ptx(native_cast, (float32,), device=True) self.assertIn("st.f32", float32_ptx) @@ -253,5 +258,5 @@ def test_native_cast(self): self.assertIn("st.u16", float16_ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py b/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py index ee09fcc31..57f3efdb8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py @@ -1,21 +1,26 @@ import numpy as np from numba import cuda, types -from numba.cuda.testing import 
(skip_on_cudasim, test_data_dir, unittest, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + test_data_dir, + unittest, + CUDATestCase, +) from numba.tests.support import skip_unless_cffi @skip_unless_cffi -@skip_on_cudasim('Simulator does not support linking') +@skip_on_cudasim("Simulator does not support linking") class TestCFFI(CUDATestCase): def test_from_buffer(self): import cffi + ffi = cffi.FFI() - link = str(test_data_dir / 'jitlink.ptx') + link = str(test_data_dir / "jitlink.ptx") sig = types.void(types.CPointer(types.int32)) - array_mutator = cuda.declare_device('array_mutator', sig) + array_mutator = cuda.declare_device("array_mutator", sig) @cuda.jit(link=[link]) def mutate_array(x): @@ -29,5 +34,5 @@ def mutate_array(x): self.assertEqual(x[0], x[1]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py index 3a9ded7c4..ddc847681 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py @@ -1,7 +1,11 @@ from math import sqrt from numba import cuda, float32, int16, int32, int64, uint32, void -from numba.cuda import (compile, compile_for_current_device, compile_ptx, - compile_ptx_for_current_device) +from numba.cuda import ( + compile, + compile_for_current_device, + compile_ptx, + compile_ptx_for_current_device, +) from numba.cuda.cudadrv import runtime from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase @@ -12,7 +16,7 @@ def f_module(x, y): return x + y -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompile(unittest.TestCase): def test_global_kernel(self): def f(r, x, y): @@ -24,11 +28,11 @@ def f(r, x, y): ptx, resty = compile_ptx(f, args) # Kernels should not have a func_retval parameter - self.assertNotIn('func_retval', ptx) + 
self.assertNotIn("func_retval", ptx) # .visible .func is used to denote a device function - self.assertNotIn('.visible .func', ptx) + self.assertNotIn(".visible .func", ptx) # .visible .entry would denote the presence of a global function - self.assertIn('.visible .entry', ptx) + self.assertIn(".visible .entry", ptx) # Return type for kernels should always be void self.assertEqual(resty, void) @@ -41,11 +45,11 @@ def add(x, y): # Device functions take a func_retval parameter for storing the # returned value in by reference - self.assertIn('func_retval', ptx) + self.assertIn("func_retval", ptx) # .visible .func is used to denote a device function - self.assertIn('.visible .func', ptx) + self.assertIn(".visible .func", ptx) # .visible .entry would denote the presence of a global function - self.assertNotIn('.visible .entry', ptx) + self.assertNotIn(".visible .entry", ptx) # Inferred return type as expected? self.assertEqual(resty, float32) @@ -71,21 +75,21 @@ def f(x, y, z, d): # Without fastmath, fma contraction is enabled by default, but ftz and # approximate div / sqrt is not. - self.assertIn('fma.rn.f32', ptx) - self.assertIn('div.rn.f32', ptx) - self.assertIn('sqrt.rn.f32', ptx) + self.assertIn("fma.rn.f32", ptx) + self.assertIn("div.rn.f32", ptx) + self.assertIn("sqrt.rn.f32", ptx) ptx, resty = compile_ptx(f, args, device=True, fastmath=True) # With fastmath, ftz and approximate div / sqrt are enabled - self.assertIn('fma.rn.ftz.f32', ptx) - self.assertIn('div.approx.ftz.f32', ptx) - self.assertIn('sqrt.approx.ftz.f32', ptx) + self.assertIn("fma.rn.ftz.f32", ptx) + self.assertIn("div.approx.ftz.f32", ptx) + self.assertIn("sqrt.approx.ftz.f32", ptx) def check_debug_info(self, ptx): # A debug_info section should exist in the PTX. Whitespace varies # between CUDA toolkit versions. - self.assertRegex(ptx, '\\.section\\s+\\.debug_info') + self.assertRegex(ptx, "\\.section\\s+\\.debug_info") # A .file directive should be produced and include the name of the # source. 
The path and whitespace may vary, so we accept anything # ending in the filename of this module. @@ -136,23 +140,25 @@ def test_non_void_return_type(self): def f(x, y): return x[0] + y[0] - with self.assertRaisesRegex(TypeError, 'must have void return type'): + with self.assertRaisesRegex(TypeError, "must have void return type"): compile_ptx(f, (uint32[::1], uint32[::1])) def test_c_abi_disallowed_for_kernel(self): def f(x, y): return x + y - with self.assertRaisesRegex(NotImplementedError, - "The C ABI is not supported for kernels"): + with self.assertRaisesRegex( + NotImplementedError, "The C ABI is not supported for kernels" + ): compile_ptx(f, (int32, int32), abi="c") def test_unsupported_abi(self): def f(x, y): return x + y - with self.assertRaisesRegex(NotImplementedError, - "Unsupported ABI: fastcall"): + with self.assertRaisesRegex( + NotImplementedError, "Unsupported ABI: fastcall" + ): compile_ptx(f, (int32, int32), abi="fastcall") def test_c_abi_device_function(self): @@ -166,8 +172,11 @@ def f(x, y): # The function name should match the Python function name (not the # qualname, which includes additional info), and its return value # should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f\(", + ) # If we compile for 64-bit integers, the return type should be 64 bits # wide @@ -175,44 +184,60 @@ def f(x, y): self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64") def test_c_abi_device_function_module_scope(self): - ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True, - abi="c") + ptx, resty = compile_ptx( + f_module, int32(int32, int32), device=True, abi="c" + ) # The function name should match the Python function name, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f_module\(") + 
self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f_module\(", + ) def test_c_abi_with_abi_name(self): - abi_info = {'abi_name': '_Z4funcii'} - ptx, resty = compile_ptx(f_module, int32(int32, int32), device=True, - abi="c", abi_info=abi_info) + abi_info = {"abi_name": "_Z4funcii"} + ptx, resty = compile_ptx( + f_module, + int32(int32, int32), + device=True, + abi="c", + abi_info=abi_info, + ) # The function name should match the one given in the ABI info, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+_Z4funcii\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+_Z4funcii\(", + ) def test_compile_defaults_to_c_abi(self): ptx, resty = compile(f_module, int32(int32, int32), device=True) # The function name should match the Python function name, and its # return value should be 32 bits - self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" - r"func_retval0\)\s+f_module\(") + self.assertRegex( + ptx, + r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+" + r"func_retval0\)\s+f_module\(", + ) def test_compile_to_ltoir(self): if runtime.get_version() < (11, 5): self.skipTest("-gen-lto unavailable in this toolkit version") - ltoir, resty = compile(f_module, int32(int32, int32), device=True, - output="ltoir") + ltoir, resty = compile( + f_module, int32(int32, int32), device=True, output="ltoir" + ) # There are no tools to interpret the LTOIR output, but we can check # that we appear to have obtained an LTOIR file. This magic number is # not documented, but is expected to remain consistent. 
LTOIR_MAGIC = 0x7F4E43ED - header = int.from_bytes(ltoir[:4], byteorder='little') + header = int.from_bytes(ltoir[:4], byteorder="little") self.assertEqual(header, LTOIR_MAGIC) self.assertEqual(resty, int32) @@ -220,11 +245,15 @@ def test_compile_to_invalid_error(self): illegal_output = "illegal" msg = f"Unsupported output type: {illegal_output}" with self.assertRaisesRegex(NotImplementedError, msg): - compile(f_module, int32(int32, int32), device=True, - output=illegal_output) + compile( + f_module, + int32(int32, int32), + device=True, + output=illegal_output, + ) -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompileForCurrentDevice(CUDATestCase): def _check_ptx_for_current_device(self, compile_function): def add(x, y): @@ -237,7 +266,7 @@ def add(x, y): # closest compute capability supported by the current toolkit. device_cc = cuda.get_current_device().compute_capability cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc) - target = f'.target sm_{cc[0]}{cc[1]}' + target = f".target sm_{cc[0]}{cc[1]}" self.assertIn(target, ptx) def test_compile_ptx_for_current_device(self): @@ -247,10 +276,10 @@ def test_compile_for_current_device(self): self._check_ptx_for_current_device(compile_for_current_device) -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCompileOnlyTests(unittest.TestCase): - '''For tests where we can only check correctness by examining the compiler - output rather than observing the effects of execution.''' + """For tests where we can only check correctness by examining the compiler + output rather than observing the effects of execution.""" def test_nanosleep(self): def use_nanosleep(x): @@ -262,15 +291,20 @@ def use_nanosleep(x): ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0)) nanosleep_count = 0 - for line in ptx.split('\n'): - if 'nanosleep.u32' in line: + for line 
in ptx.split("\n"): + if "nanosleep.u32" in line: nanosleep_count += 1 expected = 2 - self.assertEqual(expected, nanosleep_count, - (f'Got {nanosleep_count} nanosleep instructions, ' - f'expected {expected}')) + self.assertEqual( + expected, + nanosleep_count, + ( + f"Got {nanosleep_count} nanosleep instructions, " + f"expected {expected}" + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py index 958393162..d956433f2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_complex.py @@ -6,20 +6,34 @@ from numba.cuda.testing import unittest, CUDATestCase from numba.core import types from numba import cuda -from numba.tests.complex_usecases import (real_usecase, imag_usecase, - conjugate_usecase, phase_usecase, - polar_as_complex_usecase, - rect_usecase, isnan_usecase, - isinf_usecase, isfinite_usecase, - exp_usecase, log_usecase, - log_base_usecase, log10_usecase, - sqrt_usecase, asin_usecase, - acos_usecase, atan_usecase, - cos_usecase, sin_usecase, - tan_usecase, acosh_usecase, - asinh_usecase, atanh_usecase, - cosh_usecase, sinh_usecase, - tanh_usecase) +from numba.tests.complex_usecases import ( + real_usecase, + imag_usecase, + conjugate_usecase, + phase_usecase, + polar_as_complex_usecase, + rect_usecase, + isnan_usecase, + isinf_usecase, + isfinite_usecase, + exp_usecase, + log_usecase, + log_base_usecase, + log10_usecase, + sqrt_usecase, + asin_usecase, + acos_usecase, + atan_usecase, + cos_usecase, + sin_usecase, + tan_usecase, + acosh_usecase, + asinh_usecase, + atanh_usecase, + cosh_usecase, + sinh_usecase, + tanh_usecase, +) from numba.np import numpy_support @@ -29,15 +43,18 @@ def compile_scalar_func(pyfunc, argtypes, restype): assert not isinstance(restype, types.Array) device_func = cuda.jit(restype(*argtypes), device=True)(pyfunc) - kernel_types = 
[types.Array(tp, 1, "C") - for tp in [restype] + list(argtypes)] + kernel_types = [ + types.Array(tp, 1, "C") for tp in [restype] + list(argtypes) + ] if len(argtypes) == 1: + def kernel_func(out, a): i = cuda.grid(1) if i < out.shape[0]: out[i] = device_func(a[i]) elif len(argtypes) == 2: + def kernel_func(out, a, b): i = cuda.grid(1) if i < out.shape[0]: @@ -49,8 +66,9 @@ def kernel_func(out, a, b): def kernel_wrapper(values): n = len(values) - inputs = [np.empty(n, dtype=numpy_support.as_dtype(tp)) - for tp in argtypes] + inputs = [ + np.empty(n, dtype=numpy_support.as_dtype(tp)) for tp in argtypes + ] output = np.empty(n, dtype=numpy_support.as_dtype(restype)) for i, vs in enumerate(values): for v, inp in zip(vs, inputs): @@ -58,42 +76,70 @@ def kernel_wrapper(values): args = [output] + inputs kernel[int(math.ceil(n / 256)), 256](*args) return list(output) + return kernel_wrapper class BaseComplexTest(CUDATestCase): - def basic_values(self): - reals = [-0.0, +0.0, 1, -1, +1.5, -3.5, - float('-inf'), float('+inf'), float('nan')] + reals = [ + -0.0, + +0.0, + 1, + -1, + +1.5, + -3.5, + float("-inf"), + float("+inf"), + float("nan"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def more_values(self): - reals = [0.0, +0.0, 1, -1, -math.pi, +math.pi, - float('-inf'), float('+inf'), float('nan')] + reals = [ + 0.0, + +0.0, + 1, + -1, + -math.pi, + +math.pi, + float("-inf"), + float("+inf"), + float("nan"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def non_nan_values(self): - reals = [-0.0, +0.0, 1, -1, -math.pi, +math.pi, - float('inf'), float('-inf')] + reals = [ + -0.0, + +0.0, + 1, + -1, + -math.pi, + +math.pi, + float("inf"), + float("-inf"), + ] return [complex(x, y) for x, y in itertools.product(reals, reals)] def run_func(self, pyfunc, sigs, values, ulps=1, ignore_sign_on_zero=False): for sig in sigs: if isinstance(sig, types.Type): - sig = sig, + sig = (sig,) if isinstance(sig, tuple): # Assume return 
type is the type of first argument sig = sig[0](*sig) - prec = ('single' - if sig.args[0] in (types.float32, types.complex64) - else 'double') + prec = ( + "single" + if sig.args[0] in (types.float32, types.complex64) + else "double" + ) cudafunc = compile_scalar_func(pyfunc, sig.args, sig.return_type) ok_values = [] expected_list = [] for args in values: if not isinstance(args, (list, tuple)): - args = args, + args = (args,) try: expected_list.append(pyfunc(*args)) ok_values.append(args) @@ -102,24 +148,31 @@ def run_func(self, pyfunc, sigs, values, ulps=1, ignore_sign_on_zero=False): continue got_list = cudafunc(ok_values) for got, expected, args in zip(got_list, expected_list, ok_values): - msg = 'for input %r with prec %r' % (args, prec) - self.assertPreciseEqual(got, expected, prec=prec, - ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero, - msg=msg) + msg = "for input %r with prec %r" % (args, prec) + self.assertPreciseEqual( + got, + expected, + prec=prec, + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + msg=msg, + ) run_unary = run_func run_binary = run_func class TestComplex(BaseComplexTest): - def check_real_image(self, pyfunc): values = self.basic_values() - self.run_unary(pyfunc, - [tp.underlying_float(tp) - for tp in (types.complex64, types.complex128)], - values) + self.run_unary( + pyfunc, + [ + tp.underlying_float(tp) + for tp in (types.complex64, types.complex128) + ], + values, + ) def test_real(self): self.check_real_image(real_usecase) @@ -130,9 +183,7 @@ def test_imag(self): def test_conjugate(self): pyfunc = conjugate_usecase values = self.basic_values() - self.run_unary(pyfunc, - [types.complex64, types.complex128], - values) + self.run_unary(pyfunc, [types.complex64, types.complex128], values) class TestCMath(BaseComplexTest): @@ -141,26 +192,44 @@ class TestCMath(BaseComplexTest): """ def check_predicate_func(self, pyfunc): - self.run_unary(pyfunc, - [types.boolean(tp) - for tp in (types.complex128, types.complex64)], - 
self.basic_values()) - - def check_unary_func(self, pyfunc, ulps=1, values=None, - returns_float=False, ignore_sign_on_zero=False): + self.run_unary( + pyfunc, + [types.boolean(tp) for tp in (types.complex128, types.complex64)], + self.basic_values(), + ) + + def check_unary_func( + self, + pyfunc, + ulps=1, + values=None, + returns_float=False, + ignore_sign_on_zero=False, + ): if returns_float: + def sig(tp): return tp.underlying_float(tp) else: + def sig(tp): return tp(tp) - self.run_unary(pyfunc, [sig(types.complex128)], - values or self.more_values(), ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero) + + self.run_unary( + pyfunc, + [sig(types.complex128)], + values or self.more_values(), + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + ) # Avoid discontinuities around pi when in single precision. - self.run_unary(pyfunc, [sig(types.complex64)], - values or self.basic_values(), ulps=ulps, - ignore_sign_on_zero=ignore_sign_on_zero) + self.run_unary( + pyfunc, + [sig(types.complex64)], + values or self.basic_values(), + ulps=ulps, + ignore_sign_on_zero=ignore_sign_on_zero, + ) # Conversions @@ -172,11 +241,14 @@ def test_polar(self): def test_rect(self): def do_test(tp, seed_values): - values = [(z.real, z.imag) for z in seed_values - if not math.isinf(z.imag) or z.real == 0] + values = [ + (z.real, z.imag) + for z in seed_values + if not math.isinf(z.imag) or z.real == 0 + ] float_type = tp.underlying_float - self.run_binary(rect_usecase, [tp(float_type, float_type)], - values) + self.run_binary(rect_usecase, [tp(float_type, float_type)], values) + do_test(types.complex128, self.more_values()) # Avoid discontinuities around pi when in single precision. 
do_test(types.complex64, self.basic_values()) @@ -202,10 +274,11 @@ def test_log(self): def test_log_base(self): values = list(itertools.product(self.more_values(), self.more_values())) - value_types = [(types.complex128, types.complex128), - (types.complex64, types.complex64)] - self.run_binary(log_base_usecase, value_types, values, - ulps=3) + value_types = [ + (types.complex128, types.complex128), + (types.complex64, types.complex64), + ] + self.run_binary(log_base_usecase, value_types, values, ulps=3) def test_log10(self): self.check_unary_func(log10_usecase) @@ -222,8 +295,9 @@ def test_asin(self): self.check_unary_func(asin_usecase, ulps=2) def test_atan(self): - self.check_unary_func(atan_usecase, ulps=2, - values=self.non_nan_values()) + self.check_unary_func( + atan_usecase, ulps=2, values=self.non_nan_values() + ) def test_cos(self): self.check_unary_func(cos_usecase, ulps=2) @@ -233,8 +307,7 @@ def test_sin(self): self.check_unary_func(sin_usecase, ulps=2) def test_tan(self): - self.check_unary_func(tan_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(tan_usecase, ulps=2, ignore_sign_on_zero=True) # Hyperbolic functions @@ -245,8 +318,7 @@ def test_asinh(self): self.check_unary_func(asinh_usecase, ulps=2) def test_atanh(self): - self.check_unary_func(atanh_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(atanh_usecase, ulps=2, ignore_sign_on_zero=True) def test_cosh(self): self.check_unary_func(cosh_usecase, ulps=2) @@ -255,8 +327,7 @@ def test_sinh(self): self.check_unary_func(sinh_usecase, ulps=2) def test_tanh(self): - self.check_unary_func(tanh_usecase, ulps=2, - ignore_sign_on_zero=True) + self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True) class TestAtomicOnComplexComponents(CUDATestCase): @@ -292,5 +363,5 @@ def atomic_add_one_j(values): np.testing.assert_equal(arr1 + 1j, arr2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py b/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py index e72a6df00..8f948311b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py @@ -5,7 +5,7 @@ class TestCudaComplex(CUDATestCase): def test_cuda_complex_arg(self): - @cuda.jit('void(complex128[:], complex128)') + @cuda.jit("void(complex128[:], complex128)") def foo(a, b): i = cuda.grid(1) a[i] += b @@ -16,5 +16,5 @@ def foo(a, b): self.assertTrue(np.allclose(a, a0 + 2j)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py b/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py index 173319cb2..040d5305e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py @@ -14,14 +14,17 @@ def test_const_string(self): targetctx = cuda_target.target_context mod = targetctx.create_module("") - textstring = 'A Little Brown Fox' + textstring = "A Little Brown Fox" gv0 = targetctx.insert_const_string(mod, textstring) # Insert the same const string a second time - the first should be # reused. 
targetctx.insert_const_string(mod, textstring) - res = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + res = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) # Ensure that the const string was only inserted once self.assertEqual(len(res), 1) @@ -30,12 +33,16 @@ def test_const_string(self): # Using insert_const_string fn = ir.Function(mod, fnty, "test_insert_const_string") builder = ir.IRBuilder(fn.append_basic_block()) - res = builder.addrspacecast(gv0, ir.PointerType(ir.IntType(8)), - 'generic') + res = builder.addrspacecast( + gv0, ir.PointerType(ir.IntType(8)), "generic" + ) builder.ret(res) - matches = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + matches = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) self.assertEqual(len(matches), 1) # Using insert_string_const_addrspace @@ -44,11 +51,14 @@ def test_const_string(self): res = targetctx.insert_string_const_addrspace(builder, textstring) builder.ret(res) - matches = re.findall(r"@\"__conststring__.*internal.*constant.*\[" - r"19\s+x\s+i8\]", str(mod)) + matches = re.findall( + r"@\"__conststring__.*internal.*constant.*\[" + r"19\s+x\s+i8\]", + str(mod), + ) self.assertEqual(len(matches), 1) - ptx = compile_ir(str(mod)).decode('ascii') + ptx = compile_ir(str(mod)).decode("ascii") matches = list(re.findall(r"\.const.*__conststring__", ptx)) self.assertEqual(len(matches), 1) @@ -70,8 +80,8 @@ def str_assign(arr): # Expected result, e.g.: # ['XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' 'XYZ' ''] expected = np.zeros_like(arr) - expected[:-1] = 'XYZ' - expected[-1] = '' + expected[:-1] = "XYZ" + expected[-1] = "" np.testing.assert_equal(arr, expected) def test_assign_const_byte_string(self): @@ -88,42 +98,42 @@ def bytes_assign(arr): # Expected result, e.g.: # [b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b'XYZ' b''] expected = 
np.zeros_like(arr) - expected[:-1] = b'XYZ' - expected[-1] = b'' + expected[:-1] = b"XYZ" + expected[-1] = b"" np.testing.assert_equal(arr, expected) def test_assign_const_string_in_record(self): @cuda.jit def f(a): - a[0]['x'] = 1 - a[0]['y'] = 'ABC' - a[1]['x'] = 2 - a[1]['y'] = 'XYZ' + a[0]["x"] = 1 + a[0]["y"] = "ABC" + a[1]["x"] = 2 + a[1]["y"] = "XYZ" - dt = np.dtype([('x', np.int32), ('y', np.dtype('()', target='cuda') + @guvectorize(["(f8, f8, f8[:])"], "(),()->()", target="cuda") def vadd(inp, val, out): out[0] = inp + val @@ -118,8 +119,8 @@ def vadd(inp, val, out): def test_array_views(self): """Views created via array interface support: - - Strided slices - - Strided slices + - Strided slices + - Strided slices """ h_arr = np.random.random(10) c_arr = cuda.to_device(h_arr) @@ -148,23 +149,22 @@ def test_array_views(self): self.assertEqual(arr[::2].strides, arr_strided.strides) self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize) self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size) - self.assertEqual(arr[::2].nbytes, - arr_strided.size * arr_strided.dtype.itemsize) + self.assertEqual( + arr[::2].nbytes, arr_strided.size * arr_strided.dtype.itemsize + ) # __setitem__ interface propagates into external array # Writes to a slice arr[:5] = np.pi np.testing.assert_array_equal( - c_arr.copy_to_host(), - np.concatenate((np.full(5, np.pi), h_arr[5:])) + c_arr.copy_to_host(), np.concatenate((np.full(5, np.pi), h_arr[5:])) ) # Writes to a slice from a view arr[:5] = arr[5:] np.testing.assert_array_equal( - c_arr.copy_to_host(), - np.concatenate((h_arr[5:], h_arr[5:])) + c_arr.copy_to_host(), np.concatenate((h_arr[5:], h_arr[5:])) ) # Writes through a view @@ -177,10 +177,7 @@ def test_array_views(self): c_arr.copy_to_host()[::2], np.full(5, np.pi), ) - np.testing.assert_array_equal( - c_arr.copy_to_host()[1::2], - h_arr[1::2] - ) + np.testing.assert_array_equal(c_arr.copy_to_host()[1::2], h_arr[1::2]) def 
test_negative_strided_issue(self): # issue #3705 @@ -188,7 +185,7 @@ def test_negative_strided_issue(self): c_arr = cuda.to_device(h_arr) def base_offset(orig, sliced): - return sliced['data'][0] - orig['data'][0] + return sliced["data"][0] - orig["data"][0] h_ai = h_arr.__array_interface__ c_ai = c_arr.__cuda_array_interface__ @@ -202,8 +199,8 @@ def base_offset(orig, sliced): base_offset(c_ai, c_ai_sliced), ) # Check shape and strides are correct - self.assertEqual(h_ai_sliced['shape'], c_ai_sliced['shape']) - self.assertEqual(h_ai_sliced['strides'], c_ai_sliced['strides']) + self.assertEqual(h_ai_sliced["shape"], c_ai_sliced["shape"]) + self.assertEqual(h_ai_sliced["strides"], c_ai_sliced["strides"]) def test_negative_strided_copy_to_host(self): # issue #3705 @@ -212,28 +209,28 @@ def test_negative_strided_copy_to_host(self): sliced = c_arr[::-1] with self.assertRaises(NotImplementedError) as raises: sliced.copy_to_host() - expected_msg = 'D->H copy not implemented for negative strides' + expected_msg = "D->H copy not implemented for negative strides" self.assertIn(expected_msg, str(raises.exception)) def test_masked_array(self): h_arr = np.random.random(10) - h_mask = np.random.randint(2, size=10, dtype='bool') + h_mask = np.random.randint(2, size=10, dtype="bool") c_arr = cuda.to_device(h_arr) c_mask = cuda.to_device(h_mask) # Manually create a masked CUDA Array Interface dictionary masked_cuda_array_interface = c_arr.__cuda_array_interface__.copy() - masked_cuda_array_interface['mask'] = c_mask + masked_cuda_array_interface["mask"] = c_mask with self.assertRaises(NotImplementedError) as raises: cuda.from_cuda_array_interface(masked_cuda_array_interface) - expected_msg = 'Masked arrays are not supported' + expected_msg = "Masked arrays are not supported" self.assertIn(expected_msg, str(raises.exception)) def test_zero_size_array(self): # for #4175 c_arr = cuda.device_array(0) - self.assertEqual(c_arr.__cuda_array_interface__['data'][0], 0) + 
self.assertEqual(c_arr.__cuda_array_interface__["data"][0], 0) @cuda.jit def add_one(arr): @@ -249,49 +246,49 @@ def test_strides(self): # for #4175 # First, test C-contiguous array c_arr = cuda.device_array((2, 3, 4)) - self.assertEqual(c_arr.__cuda_array_interface__['strides'], None) + self.assertEqual(c_arr.__cuda_array_interface__["strides"], None) # Second, test non C-contiguous array c_arr = c_arr[:, 1, :] - self.assertNotEqual(c_arr.__cuda_array_interface__['strides'], None) + self.assertNotEqual(c_arr.__cuda_array_interface__["strides"], None) def test_consuming_strides(self): hostarray = np.arange(10).reshape(2, 5) devarray = cuda.to_device(hostarray) face = devarray.__cuda_array_interface__ - self.assertIsNone(face['strides']) + self.assertIsNone(face["strides"]) got = cuda.from_cuda_array_interface(face).copy_to_host() np.testing.assert_array_equal(got, hostarray) - self.assertTrue(got.flags['C_CONTIGUOUS']) + self.assertTrue(got.flags["C_CONTIGUOUS"]) # Try non-NULL strides - face['strides'] = hostarray.strides - self.assertIsNotNone(face['strides']) + face["strides"] = hostarray.strides + self.assertIsNotNone(face["strides"]) got = cuda.from_cuda_array_interface(face).copy_to_host() np.testing.assert_array_equal(got, hostarray) - self.assertTrue(got.flags['C_CONTIGUOUS']) + self.assertTrue(got.flags["C_CONTIGUOUS"]) def test_produce_no_stream(self): c_arr = cuda.device_array(10) - self.assertIsNone(c_arr.__cuda_array_interface__['stream']) + self.assertIsNone(c_arr.__cuda_array_interface__["stream"]) mapped_arr = cuda.mapped_array(10) - self.assertIsNone(mapped_arr.__cuda_array_interface__['stream']) + self.assertIsNone(mapped_arr.__cuda_array_interface__["stream"]) @linux_only def test_produce_managed_no_stream(self): managed_arr = cuda.managed_array(10) - self.assertIsNone(managed_arr.__cuda_array_interface__['stream']) + self.assertIsNone(managed_arr.__cuda_array_interface__["stream"]) def test_produce_stream(self): s = cuda.stream() c_arr = 
cuda.device_array(10, stream=s) - cai_stream = c_arr.__cuda_array_interface__['stream'] + cai_stream = c_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) s = cuda.stream() mapped_arr = cuda.mapped_array(10, stream=s) - cai_stream = mapped_arr.__cuda_array_interface__['stream'] + cai_stream = mapped_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) @@ -299,7 +296,7 @@ def test_produce_stream(self): def test_produce_managed_stream(self): s = cuda.stream() managed_arr = cuda.managed_array(10, stream=s) - cai_stream = managed_arr.__cuda_array_interface__['stream'] + cai_stream = managed_arr.__cuda_array_interface__["stream"] stream_value = self.get_stream_value(s) self.assertEqual(stream_value, cai_stream) @@ -327,8 +324,9 @@ def test_consume_no_sync(self): # Create a foreign array with no stream f_arr = ForeignArray(cuda.device_array(10)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was not called @@ -339,8 +337,9 @@ def test_consume_sync(self): s = cuda.stream() f_arr = ForeignArray(cuda.device_array(10, stream=s)) - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was called @@ -354,9 +353,10 @@ def test_consume_sync_disabled(self): # Set sync to false before testing. The test suite should generally be # run with sync enabled, but stash the old value just in case it is # not. 
- with override_config('CUDA_ARRAY_INTERFACE_SYNC', False): - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with override_config("CUDA_ARRAY_INTERFACE_SYNC", False): + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: cuda.as_cuda_array(f_arr) # Ensure the synchronize method of a stream was not called @@ -370,8 +370,9 @@ def test_launch_no_sync(self): def f(x): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr) # Ensure the synchronize method of a stream was not called @@ -386,8 +387,9 @@ def test_launch_sync(self): def f(x): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr) # Ensure the synchronize method of a stream was called @@ -404,8 +406,9 @@ def test_launch_sync_two_streams(self): def f(x, y): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr1, f_arr2) # Ensure that synchronize was called twice @@ -418,13 +421,15 @@ def test_launch_sync_disabled(self): f_arr1 = ForeignArray(cuda.device_array(10, stream=s1)) f_arr2 = ForeignArray(cuda.device_array(10, stream=s2)) - with override_config('CUDA_ARRAY_INTERFACE_SYNC', False): + with override_config("CUDA_ARRAY_INTERFACE_SYNC", False): + @cuda.jit def f(x, y): pass - with patch.object(cuda.cudadrv.driver.Stream, 'synchronize', - return_value=None) as mock_sync: + with patch.object( + cuda.cudadrv.driver.Stream, "synchronize", return_value=None + ) as mock_sync: f[1, 1](f_arr1, f_arr2) # Ensure that synchronize was 
not called diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py index 45af1b677..99f614677 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py @@ -80,11 +80,12 @@ def outer(argin, argout): def test_jit_debug_simulator(self): # Ensure that the jit decorator accepts the debug kwarg when the # simulator is in use - see Issue #6615. - with override_config('ENABLE_CUDASIM', 1): + with override_config("ENABLE_CUDASIM", 1): + @cuda.jit(debug=True, opt=False) def f(x): pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py b/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py index 7921f9e9b..1b177ccc4 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py @@ -13,9 +13,9 @@ def foo(start, end, delta): for i in range(cuda.grid(1), delta.size, cuda.gridsize(1)): delta[i] = end[i] - start[i] - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) - delta = np.zeros_like(arr1, dtype='timedelta64[D]') + delta = np.zeros_like(arr1, dtype="timedelta64[D]") foo[1, 32](arr1, arr2, delta) @@ -27,11 +27,12 @@ def foo(dates, target, delta, matches, outdelta): for i in range(cuda.grid(1), matches.size, cuda.gridsize(1)): matches[i] = dates[i] == target outdelta[i] = dates[i] - delta - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') - target = arr1[5] # datetime + + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") + target = arr1[5] # datetime delta = arr1[6] - arr1[5] # timedelta matches = np.zeros_like(arr1, dtype=np.bool_) - outdelta = np.zeros_like(arr1, dtype='datetime64[D]') + outdelta = np.zeros_like(arr1, 
dtype="datetime64[D]") foo[1, 32](arr1, target, delta, matches, outdelta) where = matches.nonzero() @@ -39,56 +40,59 @@ def foo(dates, target, delta, matches, outdelta): self.assertEqual(list(where), [5]) self.assertPreciseEqual(outdelta, arr1 - delta) - @skip_on_cudasim('ufunc API unsupported in the simulator') + @skip_on_cudasim("ufunc API unsupported in the simulator") def test_ufunc(self): - datetime_t = from_dtype(np.dtype('datetime64[D]')) + datetime_t = from_dtype(np.dtype("datetime64[D]")) - @vectorize([(datetime_t, datetime_t)], target='cuda') + @vectorize([(datetime_t, datetime_t)], target="cuda") def timediff(start, end): return end - start - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) delta = timediff(arr1, arr2) self.assertPreciseEqual(delta, arr2 - arr1) - @skip_on_cudasim('ufunc API unsupported in the simulator') + @skip_on_cudasim("ufunc API unsupported in the simulator") def test_gufunc(self): - datetime_t = from_dtype(np.dtype('datetime64[D]')) - timedelta_t = from_dtype(np.dtype('timedelta64[D]')) - - @guvectorize([(datetime_t, datetime_t, timedelta_t[:])], '(),()->()', - target='cuda') + datetime_t = from_dtype(np.dtype("datetime64[D]")) + timedelta_t = from_dtype(np.dtype("timedelta64[D]")) + + @guvectorize( + [(datetime_t, datetime_t, timedelta_t[:])], + "(),()->()", + target="cuda", + ) def timediff(start, end, out): out[0] = end - start - arr1 = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr1 = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr2 = arr1 + np.random.randint(0, 10000, arr1.size) delta = timediff(arr1, arr2) self.assertPreciseEqual(delta, arr2 - arr1) - @skip_on_cudasim('no .copy_to_host() in the simulator') + @skip_on_cudasim("no .copy_to_host() in the simulator") def test_datetime_view_as_int64(self): - arr = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr = 
np.arange("2005-02", "2006-02", dtype="datetime64[D]") darr = cuda.to_device(arr) viewed = darr.view(np.int64) self.assertPreciseEqual(arr.view(np.int64), viewed.copy_to_host()) self.assertEqual(viewed.gpu_data, darr.gpu_data) - @skip_on_cudasim('no .copy_to_host() in the simulator') + @skip_on_cudasim("no .copy_to_host() in the simulator") def test_timedelta_view_as_int64(self): - arr = np.arange('2005-02', '2006-02', dtype='datetime64[D]') + arr = np.arange("2005-02", "2006-02", dtype="datetime64[D]") arr = arr - (arr - 1) - self.assertEqual(arr.dtype, np.dtype('timedelta64[D]')) + self.assertEqual(arr.dtype, np.dtype("timedelta64[D]")) darr = cuda.to_device(arr) viewed = darr.view(np.int64) self.assertPreciseEqual(arr.view(np.int64), viewed.copy_to_host()) self.assertEqual(viewed.gpu_data, darr.gpu_data) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_debug.py b/numba_cuda/numba/cuda/tests/cudapy/test_debug.py index b88c25a18..00fb70c06 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_debug.py @@ -2,8 +2,11 @@ from numba.core.utils import PYVERSION from numba.cuda.testing import skip_on_cudasim, CUDATestCase -from numba.tests.support import (override_config, captured_stderr, - captured_stdout) +from numba.tests.support import ( + override_config, + captured_stderr, + captured_stdout, +) from numba import cuda, float64 import unittest @@ -13,9 +16,8 @@ def simple_cuda(A, B): B[i] = A[i] + 1.5 -@skip_on_cudasim('Simulator does not produce debug dumps') +@skip_on_cudasim("Simulator does not produce debug dumps") class TestDebugOutput(CUDATestCase): - def compile_simple_cuda(self): with captured_stderr() as err: with captured_stdout() as out: @@ -34,14 +36,14 @@ def assert_fails(self, *args, **kwargs): self.assertRaises(AssertionError, *args, **kwargs) def check_debug_output(self, out, enabled_dumps): - all_dumps = 
dict.fromkeys(['bytecode', 'cfg', 'ir', 'llvm', - 'assembly'], - False) + all_dumps = dict.fromkeys( + ["bytecode", "cfg", "ir", "llvm", "assembly"], False + ) for name in enabled_dumps: assert name in all_dumps all_dumps[name] = True for name, enabled in sorted(all_dumps.items()): - check_meth = getattr(self, '_check_dump_%s' % name) + check_meth = getattr(self, "_check_dump_%s" % name) if enabled: check_meth(out) else: @@ -50,50 +52,50 @@ def check_debug_output(self, out, enabled_dumps): def _check_dump_bytecode(self, out): if PYVERSION > (3, 10): # binop with arg=0 is binary add, see CPython dis.py and opcode.py - self.assertIn('BINARY_OP(arg=0', out) + self.assertIn("BINARY_OP(arg=0", out) else: - self.assertIn('BINARY_ADD', out) + self.assertIn("BINARY_ADD", out) def _check_dump_cfg(self, out): - self.assertIn('CFG dominators', out) + self.assertIn("CFG dominators", out) def _check_dump_ir(self, out): - self.assertIn('--IR DUMP: simple_cuda--', out) - self.assertIn('const(float, 1.5)', out) + self.assertIn("--IR DUMP: simple_cuda--", out) + self.assertIn("const(float, 1.5)", out) def _check_dump_llvm(self, out): - self.assertIn('--LLVM DUMP', out) + self.assertIn("--LLVM DUMP", out) self.assertIn('!"kernel", i32 1', out) def _check_dump_assembly(self, out): - self.assertIn('--ASSEMBLY simple_cuda', out) - self.assertIn('Generated by NVIDIA NVVM Compiler', out) + self.assertIn("--ASSEMBLY simple_cuda", out) + self.assertIn("Generated by NVIDIA NVVM Compiler", out) def test_dump_bytecode(self): - with override_config('DUMP_BYTECODE', True): + with override_config("DUMP_BYTECODE", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['bytecode']) + self.check_debug_output(out, ["bytecode"]) def test_dump_ir(self): - with override_config('DUMP_IR', True): + with override_config("DUMP_IR", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['ir']) + self.check_debug_output(out, ["ir"]) def test_dump_cfg(self): - with 
override_config('DUMP_CFG', True): + with override_config("DUMP_CFG", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['cfg']) + self.check_debug_output(out, ["cfg"]) def test_dump_llvm(self): - with override_config('DUMP_LLVM', True): + with override_config("DUMP_LLVM", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['llvm']) + self.check_debug_output(out, ["llvm"]) def test_dump_assembly(self): - with override_config('DUMP_ASSEMBLY', True): + with override_config("DUMP_ASSEMBLY", True): out = self.compile_simple_cuda() - self.check_debug_output(out, ['assembly']) + self.check_debug_output(out, ["assembly"]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py b/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py index 0afa99115..76d732474 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py @@ -1,4 +1,4 @@ -from numba.tests.support import (override_config, captured_stdout) +from numba.tests.support import override_config, captured_stdout from numba.cuda.testing import skip_on_cudasim from numba import cuda from numba.core import types @@ -8,7 +8,7 @@ import unittest -@skip_on_cudasim('Simulator does not produce debug dumps') +@skip_on_cudasim("Simulator does not produce debug dumps") class TestCudaDebugInfo(CUDATestCase): """ These tests only checks the compiled PTX for debuginfo section @@ -49,7 +49,7 @@ def foo(x): self._check(foo, sig=(types.int32[:],), expect=True) def test_environment_override(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): # Using default value @cuda.jit(opt=False) def foo(x): @@ -86,7 +86,7 @@ def f(cond): llvm_ir = f.inspect_llvm(sig) # A varible name starting with "bool" in the debug metadata - pat = r'!DILocalVariable\(.*name:\s+\"bool' + pat = 
r"!DILocalVariable\(.*name:\s+\"bool" match = re.compile(pat).search(llvm_ir) self.assertIsNone(match, msg=llvm_ir) @@ -106,7 +106,7 @@ def f(x, y): mdnode_id = match.group(1) # verify the DIBasicType has correct encoding attribute DW_ATE_boolean - pat = rf'!{mdnode_id}\s+=\s+!DIBasicType\(.*DW_ATE_boolean' + pat = rf"!{mdnode_id}\s+=\s+!DIBasicType\(.*DW_ATE_boolean" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) @@ -133,14 +133,17 @@ def f(x): llvm_ir = f.inspect_llvm(sig) - defines = [line for line in llvm_ir.splitlines() - if 'define void @"_ZN6cudapy' in line] + defines = [ + line + for line in llvm_ir.splitlines() + if 'define void @"_ZN6cudapy' in line + ] # Make sure we only found one definition self.assertEqual(len(defines), 1) wrapper_define = defines[0] - self.assertIn('!dbg', wrapper_define) + self.assertIn("!dbg", wrapper_define) def test_debug_function_calls_internal_impl(self): # Calling a function in a module generated from an implementation @@ -198,16 +201,16 @@ def test_chained_device_function(self): debug_opts = itertools.product(*[(True, False)] * 3) for kernel_debug, f1_debug, f2_debug in debug_opts: - with self.subTest(kernel_debug=kernel_debug, - f1_debug=f1_debug, - f2_debug=f2_debug): - self._test_chained_device_function(kernel_debug, - f1_debug, - f2_debug) - - def _test_chained_device_function_two_calls(self, kernel_debug, f1_debug, - f2_debug): - + with self.subTest( + kernel_debug=kernel_debug, f1_debug=f1_debug, f2_debug=f2_debug + ): + self._test_chained_device_function( + kernel_debug, f1_debug, f2_debug + ) + + def _test_chained_device_function_two_calls( + self, kernel_debug, f1_debug, f2_debug + ): @cuda.jit(device=True, debug=f2_debug, opt=False) def f2(x): return x + 1 @@ -232,12 +235,12 @@ def test_chained_device_function_two_calls(self): debug_opts = itertools.product(*[(True, False)] * 3) for kernel_debug, f1_debug, f2_debug in debug_opts: - with self.subTest(kernel_debug=kernel_debug, - 
f1_debug=f1_debug, - f2_debug=f2_debug): - self._test_chained_device_function_two_calls(kernel_debug, - f1_debug, - f2_debug) + with self.subTest( + kernel_debug=kernel_debug, f1_debug=f1_debug, f2_debug=f2_debug + ): + self._test_chained_device_function_two_calls( + kernel_debug, f1_debug, f2_debug + ) def test_chained_device_three_functions(self): # Like test_chained_device_function, but with enough functions (three) @@ -278,13 +281,13 @@ def f(x, y): llvm_ir = f.inspect_llvm(sig) # extract the metadata node id from `types` field of DISubroutineType - pat = r'!DISubroutineType\(types:\s+!(\d+)\)' + pat = r"!DISubroutineType\(types:\s+!(\d+)\)" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) mdnode_id = match.group(1) # extract the metadata node ids from the flexible node of types - pat = rf'!{mdnode_id}\s+=\s+!{{\s+!(\d+),\s+!(\d+)\s+}}' + pat = rf"!{mdnode_id}\s+=\s+!{{\s+!(\d+),\s+!(\d+)\s+}}" match = re.compile(pat).search(llvm_ir) self.assertIsNotNone(match, msg=llvm_ir) mdnode_id1 = match.group(1) @@ -303,10 +306,10 @@ def test_kernel_args_types(self): def test_kernel_args_types_dump(self): # see issue#135 - with override_config('DUMP_LLVM', 1): + with override_config("DUMP_LLVM", 1): with captured_stdout(): self._test_kernel_args_types() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py index 0c7088b74..4ff973baa 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py @@ -3,8 +3,13 @@ import numpy as np -from numba.cuda.testing import (skip_if_curand_kernel_missing, skip_on_cudasim, - test_data_dir, unittest, CUDATestCase) +from numba.cuda.testing import ( + skip_if_curand_kernel_missing, + skip_on_cudasim, + test_data_dir, + unittest, + CUDATestCase, +) from numba import cuda, jit, float32, int32, types from 
numba.core.errors import TypingError from numba.tests.support import skip_unless_cffi @@ -12,9 +17,7 @@ class TestDeviceFunc(CUDATestCase): - def test_use_add2f(self): - @cuda.jit("float32(float32, float32)", device=True) def add2f(a, b): return a + b @@ -33,7 +36,6 @@ def use_add2f(ary): self.assertTrue(np.all(ary == exp), (ary, exp)) def test_indirect_add2f(self): - @cuda.jit("float32(float32, float32)", device=True) def add2f(a, b): return a + b @@ -74,12 +76,12 @@ def add(a, b): self._check_cpu_dispatcher(add) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_cpu_dispatcher_invalid(self): # Test invalid usage # Explicit signature disables compilation, which also disable # compiling on CUDA. - @jit('(i4, i4)') + @jit("(i4, i4)") def add(a, b): return a + b @@ -95,7 +97,7 @@ def test_cpu_dispatcher_other_module(self): def add(a, b): return a + b - mymod = ModuleType(name='mymod') + mymod = ModuleType(name="mymod") mymod.add = add del add @@ -109,7 +111,7 @@ def add_kernel(ary): add_kernel[1, ary.size](ary) np.testing.assert_equal(expect, ary) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_llvm(self): @cuda.jit(device=True) def foo(x, y): @@ -120,13 +122,13 @@ def foo(x, y): fname = cres.fndesc.mangled_name # Verify that the function name has "foo" in it as in the python name - self.assertIn('foo', fname) + self.assertIn("foo", fname) llvm = foo.inspect_llvm(args) # Check that the compiled function name is in the LLVM. 
self.assertIn(fname, llvm) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_asm(self): @cuda.jit(device=True) def foo(x, y): @@ -137,13 +139,13 @@ def foo(x, y): fname = cres.fndesc.mangled_name # Verify that the function name has "foo" in it as in the python name - self.assertIn('foo', fname) + self.assertIn("foo", fname) ptx = foo.inspect_asm(args) # Check that the compiled function name is in the PTX self.assertIn(fname, ptx) - @skip_on_cudasim('not supported in cudasim') + @skip_on_cudasim("not supported in cudasim") def test_inspect_sass_disallowed(self): @cuda.jit(device=True) def foo(x, y): @@ -152,10 +154,11 @@ def foo(x, y): with self.assertRaises(RuntimeError) as raises: foo.inspect_sass((int32, int32)) - self.assertIn('Cannot inspect SASS of a device function', - str(raises.exception)) + self.assertIn( + "Cannot inspect SASS of a device function", str(raises.exception) + ) - @skip_on_cudasim('cudasim will allow calling any function') + @skip_on_cudasim("cudasim will allow calling any function") def test_device_func_as_kernel_disallowed(self): @cuda.jit(device=True) def f(): @@ -164,10 +167,12 @@ def f(): with self.assertRaises(RuntimeError) as raises: f[1, 1]() - self.assertIn('Cannot compile a device function as a kernel', - str(raises.exception)) + self.assertIn( + "Cannot compile a device function as a kernel", + str(raises.exception), + ) - @skip_on_cudasim('cudasim ignores casting by jit decorator signature') + @skip_on_cudasim("cudasim ignores casting by jit decorator signature") def test_device_casting(self): # Ensure that casts to the correct type are forced when calling a # device function with a signature. This test ensures that: @@ -176,20 +181,23 @@ def test_device_casting(self): # shouldn't # - We insert a cast when calling rgba, as opposed to failing to type. 
- @cuda.jit('int32(int32, int32, int32, int32)', device=True) + @cuda.jit("int32(int32, int32, int32, int32)", device=True) def rgba(r, g, b, a): - return (((r & 0xFF) << 16) | - ((g & 0xFF) << 8) | - ((b & 0xFF) << 0) | - ((a & 0xFF) << 24)) + return ( + ((r & 0xFF) << 16) + | ((g & 0xFF) << 8) + | ((b & 0xFF) << 0) + | ((a & 0xFF) << 24) + ) @cuda.jit def rgba_caller(x, channels): x[0] = rgba(channels[0], channels[1], channels[2], channels[3]) x = cuda.device_array(1, dtype=np.int32) - channels = cuda.to_device(np.asarray([1.0, 2.0, 3.0, 4.0], - dtype=np.float32)) + channels = cuda.to_device( + np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32) + ) rgba_caller[1, 1](x, channels) @@ -259,32 +267,31 @@ def rgba_caller(x, channels): }""") -@skip_on_cudasim('External functions unsupported in the simulator') +@skip_on_cudasim("External functions unsupported in the simulator") class TestDeclareDevice(CUDATestCase): - def check_api(self, decl): - self.assertEqual(decl.name, 'f1') + self.assertEqual(decl.name, "f1") self.assertEqual(decl.sig.args, (float32[:],)) self.assertEqual(decl.sig.return_type, int32) def test_declare_device_signature(self): - f1 = cuda.declare_device('f1', int32(float32[:])) + f1 = cuda.declare_device("f1", int32(float32[:])) self.check_api(f1) def test_declare_device_string(self): - f1 = cuda.declare_device('f1', 'int32(float32[:])') + f1 = cuda.declare_device("f1", "int32(float32[:])") self.check_api(f1) def test_bad_declare_device_tuple(self): - with self.assertRaisesRegex(TypeError, 'Return type'): - cuda.declare_device('f1', (float32[:],)) + with self.assertRaisesRegex(TypeError, "Return type"): + cuda.declare_device("f1", (float32[:],)) def test_bad_declare_device_string(self): - with self.assertRaisesRegex(TypeError, 'Return type'): - cuda.declare_device('f1', '(float32[:],)') + with self.assertRaisesRegex(TypeError, "Return type"): + cuda.declare_device("f1", "(float32[:],)") def test_link_cu_source(self): - times2 = 
cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def kernel(r, x): @@ -301,7 +308,7 @@ def kernel(r, x): def _test_link_multiple_sources(self, link_type): link = link_type([times2_cu, times4_cu]) - times4 = cuda.declare_device('times4', 'int32(int32)', link=link) + times4 = cuda.declare_device("times4", "int32(int32)", link=link) @cuda.jit def kernel(r, x): @@ -360,7 +367,7 @@ def kernel(x, seed): np.testing.assert_equal(x[0], 323845807) def test_declared_in_called_function(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -380,7 +387,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 2) def test_declared_in_called_function_twice(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func_1(x): @@ -404,7 +411,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 2) def test_declared_in_called_function_two_calls(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -424,7 +431,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 6) def test_call_declared_function_twice(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def kernel(r, x): @@ -440,7 +447,7 @@ def kernel(r, x): np.testing.assert_equal(r, x * 6) def test_declared_in_called_function_and_parent(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) @cuda.jit def device_func(x): @@ -460,8 +467,8 @@ def kernel(r, x): 
np.testing.assert_equal(r, x * 4) def test_call_two_different_declared_functions(self): - times2 = cuda.declare_device('times2', 'int32(int32)', link=times2_cu) - times3 = cuda.declare_device('times3', 'int32(int32)', link=times3_cu) + times2 = cuda.declare_device("times2", "int32(int32)", link=times2_cu) + times3 = cuda.declare_device("times3", "int32(int32)", link=times3_cu) @cuda.jit def kernel(r, x): @@ -477,5 +484,5 @@ def kernel(r, x): np.testing.assert_equal(r, x * 5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py b/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py index da5257699..4bb773ef1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py @@ -15,19 +15,18 @@ def add_kernel(r, x, y): r[0] = x + y -@skip_on_cudasim('Specialization not implemented in the simulator') +@skip_on_cudasim("Specialization not implemented in the simulator") class TestDispatcherSpecialization(CUDATestCase): def _test_no_double_specialize(self, dispatcher, ty): - with self.assertRaises(RuntimeError) as e: dispatcher.specialize(ty) - self.assertIn('Dispatcher already specialized', str(e.exception)) + self.assertIn("Dispatcher already specialized", str(e.exception)) def test_no_double_specialize_sig_same_types(self): # Attempting to specialize a kernel jitted with a signature is illegal, # even for the same types the kernel is already specialized for. - @cuda.jit('void(float32[::1])') + @cuda.jit("void(float32[::1])") def f(x): pass @@ -45,7 +44,7 @@ def f(x): def test_no_double_specialize_sig_diff_types(self): # Attempting to specialize a kernel jitted with a signature is illegal. 
- @cuda.jit('void(int32[::1])') + @cuda.jit("void(int32[::1])") def f(x): pass @@ -132,13 +131,13 @@ def test_coerce_input_types(self): self.assertEqual(r[0], add(12300000000, 456)) # Now force compilation of only a single specialization - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) c_add[1, 1](r, 123, 456) self.assertPreciseEqual(r[0], add(123, 456)) - @skip_on_cudasim('Simulator ignores signature') + @skip_on_cudasim("Simulator ignores signature") @unittest.expectedFailure def test_coerce_input_types_unsafe(self): # Implicit (unsafe) conversion of float to int, originally from @@ -149,25 +148,24 @@ def test_coerce_input_types_unsafe(self): # # This test is marked as xfail until future changes enable this # behavior. - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) c_add[1, 1](r, 12.3, 45.6) self.assertPreciseEqual(r[0], add(12, 45)) - @skip_on_cudasim('Simulator ignores signature') + @skip_on_cudasim("Simulator ignores signature") def test_coerce_input_types_unsafe_complex(self): # Implicit conversion of complex to int disallowed - c_add = cuda.jit('(i4[::1], i4, i4)')(add_kernel) + c_add = cuda.jit("(i4[::1], i4, i4)")(add_kernel) r = np.zeros(1, dtype=np.int32) with self.assertRaises(TypeError): c_add[1, 1](r, 12.3, 45.6j) - @skip_on_cudasim('Simulator does not track overloads') + @skip_on_cudasim("Simulator does not track overloads") def test_ambiguous_new_version(self): - """Test compiling new version in an ambiguous case - """ + """Test compiling new version in an ambiguous case""" c_add = cuda.jit(add_kernel) r = np.zeros(1, dtype=np.float64) @@ -190,8 +188,9 @@ def test_ambiguous_new_version(self): # to (float, int) or (int, float) with equal weight. 
c_add[1, 1](r, 1, 1) self.assertAlmostEqual(r[0], INT + INT) - self.assertEqual(len(c_add.overloads), 4, "didn't compile a new " - "version") + self.assertEqual( + len(c_add.overloads), 4, "didn't compile a new version" + ) @skip_on_cudasim("Simulator doesn't support concurrent kernels") def test_lock(self): @@ -245,8 +244,10 @@ def _test_explicit_signatures(self, sigs): def test_explicit_signatures_strings(self): # Check with a list of strings for signatures - sigs = ["(int64[::1], int64, int64)", - "(float64[::1], float64, float64)"] + sigs = [ + "(int64[::1], int64, int64)", + "(float64[::1], float64, float64)", + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_tuples(self): @@ -256,26 +257,31 @@ def test_explicit_signatures_tuples(self): def test_explicit_signatures_signatures(self): # Check with a list of Signature objects for signatures - sigs = [void(int64[::1], int64, int64), - void(float64[::1], float64, float64)] + sigs = [ + void(int64[::1], int64, int64), + void(float64[::1], float64, float64), + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_mixed(self): # Check when we mix types of signature objects in a list of signatures # Tuple and string - sigs = [(int64[::1], int64, int64), - "(float64[::1], float64, float64)"] + sigs = [(int64[::1], int64, int64), "(float64[::1], float64, float64)"] self._test_explicit_signatures(sigs) # Tuple and Signature object - sigs = [(int64[::1], int64, int64), - void(float64[::1], float64, float64)] + sigs = [ + (int64[::1], int64, int64), + void(float64[::1], float64, float64), + ] self._test_explicit_signatures(sigs) # Signature object and string - sigs = [void(int64[::1], int64, int64), - "(float64[::1], float64, float64)"] + sigs = [ + void(int64[::1], int64, int64), + "(float64[::1], float64, float64)", + ] self._test_explicit_signatures(sigs) def test_explicit_signatures_same_type_class(self): @@ -284,8 +290,10 @@ def test_explicit_signatures_same_type_class(self): # that 
dispatch is differentiated on the types of x and y only, to # closely preserve the intent of the original test from # numba.tests.test_dispatcher) - sigs = ["(float64[::1], float32, float32)", - "(float64[::1], float64, float64)"] + sigs = [ + "(float64[::1], float32, float32)", + "(float64[::1], float64, float64)", + ] f = cuda.jit(sigs)(add_kernel) r = np.zeros(1, dtype=np.float64) @@ -296,13 +304,17 @@ def test_explicit_signatures_same_type_class(self): f[1, 1](r, 1, 2**-25) self.assertPreciseEqual(r[0], 1.0000000298023224) - @skip_on_cudasim('No overload resolution in the simulator') + @skip_on_cudasim("No overload resolution in the simulator") def test_explicit_signatures_ambiguous_resolution(self): # Fail to resolve ambiguity between the two best overloads # (Also deliberate float64[::1] for the first argument in all cases) - f = cuda.jit(["(float64[::1], float32, float64)", - "(float64[::1], float64, float32)", - "(float64[::1], int64, int64)"])(add_kernel) + f = cuda.jit( + [ + "(float64[::1], float32, float64)", + "(float64[::1], float64, float32)", + "(float64[::1], int64, int64)", + ] + )(add_kernel) with self.assertRaises(TypeError) as cm: r = np.zeros(1, dtype=np.float64) f[1, 1](r, 1.0, 2.0) @@ -317,12 +329,12 @@ def test_explicit_signatures_ambiguous_resolution(self): r"\(Array\(float64, 1, 'C', False, aligned=True\), float32," r" float64\) -> none\n" r"\(Array\(float64, 1, 'C', False, aligned=True\), float64," - r" float32\) -> none" + r" float32\) -> none", ) # The integer signature is not part of the best matches self.assertNotIn("int64", str(cm.exception)) - @skip_on_cudasim('Simulator does not use _prepare_args') + @skip_on_cudasim("Simulator does not use _prepare_args") @unittest.expectedFailure def test_explicit_signatures_unsafe(self): # These tests are from test_explicit_signatures, but have to be xfail @@ -336,8 +348,10 @@ def test_explicit_signatures_unsafe(self): self.assertPreciseEqual(r[0], 3) self.assertEqual(len(f.overloads), 1, 
f.overloads) - sigs = ["(int64[::1], int64, int64)", - "(float64[::1], float64, float64)"] + sigs = [ + "(int64[::1], int64, int64)", + "(float64[::1], float64, float64)", + ] f = cuda.jit(sigs)(add_kernel) r = np.zeros(1, dtype=np.float64) # Approximate match (int32 -> float64 is a safe conversion) @@ -414,7 +428,7 @@ def test_explicit_signatures_device_ambiguous(self): f[1, 1](r, 1.5, 2.5) self.assertPreciseEqual(r[0], 4.0) - @skip_on_cudasim('CUDA Simulator does not force casting') + @skip_on_cudasim("CUDA Simulator does not force casting") def test_explicit_signatures_device_unsafe(self): # These tests are from test_explicit_signatures. The device function # variant of these tests can succeed on CUDA because the compilation @@ -489,17 +503,15 @@ def pi_sin_array(x, n): # provides the same values as getting the registers per thread for # individual signatures. regs_per_thread_all = pi_sin_array.get_regs_per_thread() - self.assertEqual(regs_per_thread_all[sig_f32.args], - regs_per_thread_f32) - self.assertEqual(regs_per_thread_all[sig_f64.args], - regs_per_thread_f64) + self.assertEqual(regs_per_thread_all[sig_f32.args], regs_per_thread_f32) + self.assertEqual(regs_per_thread_all[sig_f64.args], regs_per_thread_f64) if regs_per_thread_f32 == regs_per_thread_f64: # If the register usage is the same for both variants, there may be # a bug, but this may also be an artifact of the compiler / driver # / device combination, so produce an informational message only. - print('f32 and f64 variant thread usages are equal.') - print('This may warrant some investigation. Devices:') + print("f32 and f64 variant thread usages are equal.") + print("This may warrant some investigation. 
Devices:") cuda.detect() def test_get_regs_per_thread_specialized(self): @@ -696,5 +708,5 @@ def simple_lmem(ary): self.assertGreaterEqual(local_mem_per_thread, N * 4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_enums.py b/numba_cuda/numba/cuda/tests/cudapy/test_enums.py index da60b7565..6db955e06 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_enums.py @@ -12,18 +12,17 @@ Shape, Planet, RequestError, - IntEnumWithNegatives + IntEnumWithNegatives, ) class EnumTest(CUDATestCase): - pairs = [ (Color.red, Color.red), (Color.red, Color.green), (Planet.EARTH, Planet.EARTH), (Planet.VENUS, Planet.MARS), - (Shape.circle, IntEnumWithNegatives.two) # IntEnum, same value + (Shape.circle, IntEnumWithNegatives.two), # IntEnum, same value ] def test_compare(self): @@ -45,7 +44,7 @@ def test_getattr_getitem(self): def f(out): # Lookup of an enum member on its class out[0] = Color.red == Color.green - out[1] = Color['red'] == Color['green'] + out[1] = Color["red"] == Color["green"] cuda_f = cuda.jit(f) got = np.zeros((2,), dtype=np.bool_) @@ -106,16 +105,16 @@ def f(x, out): def test_vectorize(self): def f(x): if x != RequestError.not_found: - return RequestError['internal_error'] + return RequestError["internal_error"] else: return RequestError.dummy - cuda_func = vectorize("int64(int64)", target='cuda')(f) + cuda_func = vectorize("int64(int64)", target="cuda")(f) arr = np.array([2, 404, 500, 404], dtype=np.int64) expected = np.array([f(x) for x in arr], dtype=np.int64) got = cuda_func(arr) self.assertPreciseEqual(expected, got) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py index c20fb8dcc..0b24bee8e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +++ 
b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py @@ -17,34 +17,49 @@ def test_too_many_dims(self): with self.assertRaises(ValueError) as raises: kernfunc[(1, 2, 3, 4), (5, 6)] - self.assertIn("griddim must be a sequence of 1, 2 or 3 integers, " - "got [1, 2, 3, 4]", - str(raises.exception)) + self.assertIn( + "griddim must be a sequence of 1, 2 or 3 integers, " + "got [1, 2, 3, 4]", + str(raises.exception), + ) with self.assertRaises(ValueError) as raises: - kernfunc[(1, 2,), (3, 4, 5, 6)] - self.assertIn("blockdim must be a sequence of 1, 2 or 3 integers, " - "got [3, 4, 5, 6]", - str(raises.exception)) + kernfunc[ + ( + 1, + 2, + ), + (3, 4, 5, 6), + ] + self.assertIn( + "blockdim must be a sequence of 1, 2 or 3 integers, " + "got [3, 4, 5, 6]", + str(raises.exception), + ) def test_non_integral_dims(self): kernfunc = cuda.jit(noop) with self.assertRaises(TypeError) as raises: kernfunc[2.0, 3] - self.assertIn("griddim must be a sequence of integers, got [2.0]", - str(raises.exception)) + self.assertIn( + "griddim must be a sequence of integers, got [2.0]", + str(raises.exception), + ) with self.assertRaises(TypeError) as raises: kernfunc[2, 3.0] - self.assertIn("blockdim must be a sequence of integers, got [3.0]", - str(raises.exception)) + self.assertIn( + "blockdim must be a sequence of integers, got [3.0]", + str(raises.exception), + ) def _test_unconfigured(self, kernfunc): with self.assertRaises(ValueError) as raises: kernfunc(0) - self.assertIn("launch configuration was not specified", - str(raises.exception)) + self.assertIn( + "launch configuration was not specified", str(raises.exception) + ) def test_unconfigured_typed_cudakernel(self): kernfunc = cuda.jit("void(int32)")(noop) @@ -54,7 +69,7 @@ def test_unconfigured_untyped_cudakernel(self): kernfunc = cuda.jit(noop) self._test_unconfigured(kernfunc) - @skip_on_cudasim('TypingError does not occur on simulator') + @skip_on_cudasim("TypingError does not occur on simulator") def test_typing_error(self): 
# see #5860, this is present to catch changes to error reporting # accidentally breaking the CUDA target @@ -75,5 +90,5 @@ def kernel_func(): self.assertIn("NameError: name 'floor' is not defined", excstr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_exception.py b/numba_cuda/numba/cuda/tests/cudapy/test_exception.py index 42f31074a..63dce76eb 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_exception.py @@ -83,20 +83,19 @@ def oracle(x, y): x[i] += x[i] // y[i] n = 32 - got_x = 1. / (np.arange(n) + 0.01) - got_y = 1. / (np.arange(n) + 0.01) + got_x = 1.0 / (np.arange(n) + 0.01) + got_y = 1.0 / (np.arange(n) + 0.01) problematic[1, n](got_x, got_y) - expect_x = 1. / (np.arange(n) + 0.01) - expect_y = 1. / (np.arange(n) + 0.01) + expect_x = 1.0 / (np.arange(n) + 0.01) + expect_y = 1.0 / (np.arange(n) + 0.01) oracle[1, n](expect_x, expect_y) np.testing.assert_almost_equal(expect_x, got_x) np.testing.assert_almost_equal(expect_y, got_y) def test_raise_causing_warp_diverge(self): - """Test case for issue #2655. 
- """ + """Test case for issue #2655.""" self.case_raise_causing_warp_diverge(with_debug_mode=False) # The following two cases relate to Issue #7806: Division by zero stops the @@ -117,8 +116,8 @@ def f(r, x, y): f[1, 1](r, x, y) - self.assertTrue(np.isinf(r[0]), 'Expected inf from div by zero') - self.assertEqual(r[1], y[0], 'Expected execution to continue') + self.assertTrue(np.isinf(r[0]), "Expected inf from div by zero") + self.assertEqual(r[1], y[0], "Expected execution to continue") def test_zero_division_error_in_debug(self): # When debug is True: @@ -146,15 +145,15 @@ def f(r, x, y): with self.assertRaises(exc): f[1, 1](r, x, y) - self.assertEqual(r[0], 0, 'Expected result to be left unset') - self.assertEqual(r[1], 0, 'Expected execution to stop') + self.assertEqual(r[0], 0, "Expected result to be left unset") + self.assertEqual(r[1], 0, "Expected execution to stop") @xfail_unless_cudasim def test_raise_in_device_function(self): # This is an expected failure because reporting of exceptions raised in # device functions does not work correctly - see Issue #8036: # https://github.com/numba/numba/issues/8036 - msg = 'Device Function Error' + msg = "Device Function Error" @cuda.jit(device=True) def f(): @@ -170,5 +169,5 @@ def kernel(): self.assertIn(msg, str(raises.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py index 142d917c0..18f3ac478 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_extending.py @@ -8,12 +8,13 @@ class Interval: """ A half-open interval on the real number line. 
""" + def __init__(self, lo, hi): self.lo = lo self.hi = hi def __repr__(self): - return 'Interval(%f, %f)' % (self.lo, self.hi) + return "Interval(%f, %f)" % (self.lo, self.hi) @property def width(self): @@ -32,16 +33,21 @@ def sum_intervals(i, j): if not config.ENABLE_CUDASIM: from numba.core import cgutils - from numba.core.extending import (lower_builtin, make_attribute_wrapper, - models, register_model, type_callable, - typeof_impl) + from numba.core.extending import ( + lower_builtin, + make_attribute_wrapper, + models, + register_model, + type_callable, + typeof_impl, + ) from numba.core.typing.templates import AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry from numba.cuda.cudaimpl import lower_attr as cuda_lower_attr class IntervalType(types.Type): def __init__(self): - super().__init__(name='Interval') + super().__init__(name="Interval") interval_type = IntervalType() @@ -54,19 +60,20 @@ def type_interval(context): def typer(lo, hi): if isinstance(lo, types.Float) and isinstance(hi, types.Float): return interval_type + return typer @register_model(IntervalType) class IntervalModel(models.StructModel): def __init__(self, dmm, fe_type): members = [ - ('lo', types.float64), - ('hi', types.float64), + ("lo", types.float64), + ("hi", types.float64), ] models.StructModel.__init__(self, dmm, fe_type, members) - make_attribute_wrapper(IntervalType, 'lo', 'lo') - make_attribute_wrapper(IntervalType, 'hi', 'hi') + make_attribute_wrapper(IntervalType, "lo", "lo") + make_attribute_wrapper(IntervalType, "hi", "hi") @lower_builtin(Interval, types.Float, types.Float) def impl_interval(context, builder, sig, args): @@ -84,14 +91,14 @@ class Interval_attrs(AttributeTemplate): def resolve_width(self, mod): return types.float64 - @cuda_lower_attr(IntervalType, 'width') + @cuda_lower_attr(IntervalType, "width") def cuda_Interval_width(context, builder, sig, arg): lo = builder.extract_value(arg, 0) hi = builder.extract_value(arg, 1) return 
builder.fsub(hi, lo) -@skip_on_cudasim('Extensions not supported in the simulator') +@skip_on_cudasim("Extensions not supported in the simulator") class TestExtending(CUDATestCase): def test_attributes(self): @cuda.jit @@ -151,5 +158,5 @@ def f(r, x): np.testing.assert_allclose(r, expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py b/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py index f4b705683..de250d635 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py @@ -5,8 +5,7 @@ from math import cos, sin, tan, exp, log, log10, log2, pow, tanh from operator import truediv import numpy as np -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_unless_cc_75) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim, skip_unless_cc_75 import unittest @@ -24,10 +23,9 @@ def check(self, test: CUDATestCase, fast: str, prec: str): test.assertTrue(all(i not in prec for i in self.prec_unexpected)) -@skip_on_cudasim('Fastmath and PTX inspection not available on cudasim') +@skip_on_cudasim("Fastmath and PTX inspection not available on cudasim") class TestFastMathOption(CUDATestCase): def _test_fast_math_common(self, pyfunc, sig, device, criterion): - # Test jit code path fastver = cuda.jit(sig, device=device, fastmath=True)(pyfunc) precver = cuda.jit(sig, device=device)(pyfunc) @@ -40,9 +38,7 @@ def _test_fast_math_common(self, pyfunc, sig, device, criterion): fastptx, _ = compile_ptx_for_current_device( pyfunc, sig, device=device, fastmath=True ) - precptx, _ = compile_ptx_for_current_device( - pyfunc, sig, device=device - ) + precptx, _ = compile_ptx_for_current_device(pyfunc, sig, device=device) criterion.check(self, fastptx, precptx) @@ -69,7 +65,9 @@ def device(x, y): self._test_fast_math_common( kernel, - (float32[::1], float32, float32), device=False, criterion=criterion + 
(float32[::1], float32, float32), + device=False, + criterion=criterion, ) self._test_fast_math_common( device, (float32, float32), device=True, criterion=criterion @@ -79,39 +77,41 @@ def test_cosf(self): self._test_fast_math_unary( cos, FastMathCriterion( - fast_expected=['cos.approx.ftz.f32 '], - prec_unexpected=['cos.approx.ftz.f32 '] - ) + fast_expected=["cos.approx.ftz.f32 "], + prec_unexpected=["cos.approx.ftz.f32 "], + ), ) def test_sinf(self): self._test_fast_math_unary( sin, FastMathCriterion( - fast_expected=['sin.approx.ftz.f32 '], - prec_unexpected=['sin.approx.ftz.f32 '] - ) + fast_expected=["sin.approx.ftz.f32 "], + prec_unexpected=["sin.approx.ftz.f32 "], + ), ) def test_tanf(self): self._test_fast_math_unary( tan, - FastMathCriterion(fast_expected=[ - 'sin.approx.ftz.f32 ', - 'cos.approx.ftz.f32 ', - 'div.approx.ftz.f32 ' - ], prec_unexpected=['sin.approx.ftz.f32 ']) + FastMathCriterion( + fast_expected=[ + "sin.approx.ftz.f32 ", + "cos.approx.ftz.f32 ", + "div.approx.ftz.f32 ", + ], + prec_unexpected=["sin.approx.ftz.f32 "], + ), ) @skip_unless_cc_75 def test_tanhf(self): - self._test_fast_math_unary( tanh, FastMathCriterion( - fast_expected=['tanh.approx.f32 '], - prec_unexpected=['tanh.approx.f32 '] - ) + fast_expected=["tanh.approx.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), ) def test_tanhf_compile_ptx(self): @@ -119,74 +119,85 @@ def tanh_kernel(r, x): r[0] = tanh(x) def tanh_common_test(cc, criterion): - fastptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32), - fastmath=True, cc=cc) - precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32), - cc=cc) + fastptx, _ = compile_ptx( + tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc + ) + precptx, _ = compile_ptx( + tanh_kernel, (float32[::1], float32), cc=cc + ) criterion.check(self, fastptx, precptx) - tanh_common_test(cc=(7, 5), criterion=FastMathCriterion( - fast_expected=['tanh.approx.f32 '], - prec_unexpected=['tanh.approx.f32 '] - )) + tanh_common_test( + 
cc=(7, 5), + criterion=FastMathCriterion( + fast_expected=["tanh.approx.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), + ) - tanh_common_test(cc=(7, 0), - criterion=FastMathCriterion( - fast_expected=['ex2.approx.ftz.f32 ', - 'rcp.approx.ftz.f32 '], - prec_unexpected=['tanh.approx.f32 '])) + tanh_common_test( + cc=(7, 0), + criterion=FastMathCriterion( + fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "], + prec_unexpected=["tanh.approx.f32 "], + ), + ) def test_expf(self): self._test_fast_math_unary( exp, FastMathCriterion( - fast_unexpected=['fma.rn.f32 '], - prec_expected=['fma.rn.f32 '] - ) + fast_unexpected=["fma.rn.f32 "], prec_expected=["fma.rn.f32 "] + ), ) def test_logf(self): # Look for constant used to convert from log base 2 to log base e self._test_fast_math_unary( - log, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 ', '0f3F317218'], - prec_unexpected=['lg2.approx.ftz.f32 '], - ) + log, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 ", "0f3F317218"], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_log10f(self): # Look for constant used to convert from log base 2 to log base 10 self._test_fast_math_unary( - log10, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 ', '0f3E9A209B'], - prec_unexpected=['lg2.approx.ftz.f32 '] - ) + log10, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 ", "0f3E9A209B"], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_log2f(self): self._test_fast_math_unary( - log2, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 '], - prec_unexpected=['lg2.approx.ftz.f32 '] - ) + log2, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 "], + prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_powf(self): self._test_fast_math_binary( - pow, FastMathCriterion( - fast_expected=['lg2.approx.ftz.f32 '], - prec_unexpected=['lg2.approx.ftz.f32 '], - ) + pow, + FastMathCriterion( + fast_expected=["lg2.approx.ftz.f32 "], + 
prec_unexpected=["lg2.approx.ftz.f32 "], + ), ) def test_divf(self): self._test_fast_math_binary( - truediv, FastMathCriterion( - fast_expected=['div.approx.ftz.f32 '], - fast_unexpected=['div.rn.f32'], - prec_expected=['div.rn.f32'], - prec_unexpected=['div.approx.ftz.f32 '], - ) + truediv, + FastMathCriterion( + fast_expected=["div.approx.ftz.f32 "], + fast_unexpected=["div.rn.f32"], + prec_expected=["div.rn.f32"], + prec_unexpected=["div.approx.ftz.f32 "], + ), ) def test_divf_exception(self): @@ -232,13 +243,13 @@ def bar(arr, val): # https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div # The fast version should use the "fast, approximate divide" variant - self.assertIn('div.approx.f32', fastver.inspect_asm(sig)) + self.assertIn("div.approx.f32", fastver.inspect_asm(sig)) # The precise version should use the "IEEE 754 compliant rounding" # variant, and neither of the "approximate divide" variants. - self.assertIn('div.rn.f32', precver.inspect_asm(sig)) - self.assertNotIn('div.approx.f32', precver.inspect_asm(sig)) - self.assertNotIn('div.full.f32', precver.inspect_asm(sig)) + self.assertIn("div.rn.f32", precver.inspect_asm(sig)) + self.assertNotIn("div.approx.f32", precver.inspect_asm(sig)) + self.assertNotIn("div.full.f32", precver.inspect_asm(sig)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_forall.py b/numba_cuda/numba/cuda/tests/cudapy/test_forall.py index 23286c22c..adef70911 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_forall.py @@ -44,9 +44,11 @@ def test_forall_negative_work(self): # negative element count. 
with self.assertRaises(ValueError) as raises: foo.forall(-1) - self.assertIn("Can't create ForAll with negative task count", - str(raises.exception)) + self.assertIn( + "Can't create ForAll with negative task count", + str(raises.exception), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py b/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py index 6b7b2d2ab..14470902f 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py @@ -17,13 +17,15 @@ def test_freevar(self): @cuda.jit("(float32[::1], intp)") def foo(A, i): "Dummy function" - sdata = cuda.shared.array(size, # size is freevar - dtype=nbtype) # nbtype is freevar + sdata = cuda.shared.array( + size, # size is freevar + dtype=nbtype, + ) # nbtype is freevar A[i] = sdata[i] A = np.arange(2, dtype="float32") foo[1, 1](A, 0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py b/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py index 71169801e..e78971dd1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py @@ -62,5 +62,5 @@ def test_ldexp_f8(self): self.template_test_ldexp(np.float64, float64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_globals.py b/numba_cuda/numba/cuda/tests/cudapy/test_globals.py index a2406e665..0bfb277c8 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_globals.py @@ -29,8 +29,7 @@ def coop_smem2d(ary): class TestCudaTestGlobal(CUDATestCase): def test_global_int_const(self): - """Test simple_smem - """ + """Test simple_smem""" compiled = cuda.jit("void(int32[:])")(simple_smem) nelem = 100 @@ -41,8 +40,7 @@ def 
test_global_int_const(self): @unittest.SkipTest def test_global_tuple_const(self): - """Test coop_smem2d - """ + """Test coop_smem2d""" compiled = cuda.jit("void(float32[:,:])")(coop_smem2d) shape = 10, 20 @@ -56,5 +54,5 @@ def test_global_tuple_const(self): self.assertTrue(np.allclose(ary, exp)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py index 098318e3a..954ed635d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py @@ -12,9 +12,11 @@ def _get_matmulcore_gufunc(dtype=float32): - @guvectorize([void(dtype[:, :], dtype[:, :], dtype[:, :])], - '(m,n),(n,p)->(m,p)', - target='cuda') + @guvectorize( + [void(dtype[:, :], dtype[:, :], dtype[:, :])], + "(m,n),(n,p)->(m,p)", + target="cuda", + ) def matmulcore(A, B, C): m, n = A.shape n, p = B.shape @@ -27,32 +29,33 @@ def matmulcore(A, B, C): return matmulcore -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAGufunc(CUDATestCase): - def test_gufunc_small(self): - gufunc = _get_matmulcore_gufunc() matrix_ct = 2 - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) C = gufunc(A, B) Gold = np.matmul(A, B) self.assertTrue(np.allclose(C, Gold)) def test_gufunc_auto_transfer(self): - gufunc = _get_matmulcore_gufunc() matrix_ct = 2 - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + 
B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) dB = cuda.to_device(B) @@ -61,24 +64,24 @@ def test_gufunc_auto_transfer(self): self.assertTrue(np.allclose(C, Gold)) def test_gufunc(self): - gufunc = _get_matmulcore_gufunc() - matrix_ct = 1001 # an odd number to test thread/block division in CUDA - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + matrix_ct = 1001 # an odd number to test thread/block division in CUDA + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) C = gufunc(A, B) Gold = np.matmul(A, B) self.assertTrue(np.allclose(C, Gold)) def test_gufunc_hidim(self): - gufunc = _get_matmulcore_gufunc() - matrix_ct = 100 # an odd number to test thread/block division in CUDA + matrix_ct = 100 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(4, 25, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(4, 25, 4, 5) @@ -87,7 +90,6 @@ def test_gufunc_hidim(self): self.assertTrue(np.allclose(C, Gold)) def test_gufunc_new_axis(self): - gufunc = _get_matmulcore_gufunc(dtype=float64) X = np.random.randn(10, 3, 3) @@ -102,15 +104,16 @@ def test_gufunc_new_axis(self): np.testing.assert_allclose(gold, res2) def test_gufunc_stream(self): - gufunc = _get_matmulcore_gufunc() - #cuda.driver.flush_pending_free() - matrix_ct = 1001 # an odd number to test thread/block division in CUDA - A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, - 4) - B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, - 5) + # cuda.driver.flush_pending_free() + matrix_ct = 1001 # an odd number to test thread/block division in CUDA + A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape( + matrix_ct, 2, 4 + ) + B = 
np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape( + matrix_ct, 4, 5 + ) stream = cuda.stream() dA = cuda.to_device(A, stream) @@ -126,10 +129,7 @@ def test_gufunc_stream(self): self.assertTrue(np.allclose(C, Gold)) def test_copy(self): - - @guvectorize([void(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -142,9 +142,7 @@ def copy(A, B): def test_copy_unspecified_return(self): # Ensure that behaviour is correct when the return type is not # specified in the signature. - @guvectorize([(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -155,10 +153,7 @@ def copy(A, B): self.assertTrue(np.allclose(A, B)) def test_copy_odd(self): - - @guvectorize([void(float32[:], float32[:])], - '(x)->(x)', - target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def copy(A, B): for i in range(B.size): B[i] = A[i] @@ -169,10 +164,11 @@ def copy(A, B): self.assertTrue(np.allclose(A, B)) def test_copy2d(self): - - @guvectorize([void(float32[:, :], float32[:, :])], - '(x, y)->(x, y)', - target='cuda') + @guvectorize( + [void(float32[:, :], float32[:, :])], + "(x, y)->(x, y)", + target="cuda", + ) def copy2d(A, B): for x in range(B.shape[0]): for y in range(B.shape[1]): @@ -185,8 +181,7 @@ def copy2d(A, B): def test_not_supported_call_from_jit(self): # not supported - @guvectorize([void(int32[:], int32[:])], - '(n)->(n)', target='cuda') + @guvectorize([void(int32[:], int32[:])], "(n)->(n)", target="cuda") def gufunc_copy(A, b): for i in range(A.shape[0]): b[i] = A[i] @@ -195,7 +190,7 @@ def gufunc_copy(A, b): def cuda_jit(A, b): return gufunc_copy(A, b) - A = np.arange(1024 * 32).astype('int32') + A = np.arange(1024 * 32).astype("int32") b = np.zeros_like(A) msg = "Untyped global name 
'gufunc_copy'.*" with self.assertRaisesRegex(TypingError, msg): @@ -204,56 +199,68 @@ def cuda_jit(A, b): # Test inefficient use of the GPU where the inputs are all mapped onto a # single thread in a single block. def test_inefficient_launch_configuration(self): - @guvectorize(['void(float32[:], float32[:], float32[:])'], - '(n),(n)->(n)', target='cuda') + @guvectorize( + ["void(float32[:], float32[:], float32[:])"], + "(n),(n)->(n)", + target="cuda", + ) def numba_dist_cuda(a, b, dist): len = a.shape[0] for i in range(len): dist[i] = a[i] * b[i] - a = np.random.rand(1024 * 32).astype('float32') - b = np.random.rand(1024 * 32).astype('float32') - dist = np.zeros(a.shape[0]).astype('float32') + a = np.random.rand(1024 * 32).astype("float32") + b = np.random.rand(1024 * 32).astype("float32") + dist = np.zeros(a.shape[0]).astype("float32") - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: numba_dist_cuda(a, b, dist) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Grid size', str(w[0].message)) - self.assertIn('low occupancy', str(w[0].message)) + self.assertIn("Grid size", str(w[0].message)) + self.assertIn("low occupancy", str(w[0].message)) def test_efficient_launch_configuration(self): - @guvectorize(['void(float32[:], float32[:], float32[:])'], - '(n),(n)->(n)', nopython=True, target='cuda') + @guvectorize( + ["void(float32[:], float32[:], float32[:])"], + "(n),(n)->(n)", + nopython=True, + target="cuda", + ) def numba_dist_cuda2(a, b, dist): len = a.shape[0] for i in range(len): dist[i] = a[i] * b[i] - a = np.random.rand(524288 * 2).astype('float32').\ - reshape((524288, 2)) - b = np.random.rand(524288 * 2).astype('float32').\ - reshape((524288, 2)) + a = np.random.rand(524288 * 2).astype("float32").reshape((524288, 2)) + b = np.random.rand(524288 * 2).astype("float32").reshape((524288, 2)) dist = np.zeros_like(a) - with 
override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: numba_dist_cuda2(a, b, dist) self.assertEqual(len(w), 0) def test_nopython_flag(self): - def foo(A, B): pass # nopython = True is fine - guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda', - nopython=True)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + nopython=True, + )(foo) # nopython = False is bad with self.assertRaises(TypeError) as raises: - guvectorize([void(float32[:], float32[:])], '(x)->(x)', - target='cuda', nopython=False)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + nopython=False, + )(foo) self.assertEqual("nopython flag must be True", str(raises.exception)) def test_invalid_flags(self): @@ -262,17 +269,22 @@ def foo(A, B): pass with self.assertRaises(TypeError) as raises: - guvectorize([void(float32[:], float32[:])], '(x)->(x)', - target='cuda', what1=True, ever2=False)(foo) + guvectorize( + [void(float32[:], float32[:])], + "(x)->(x)", + target="cuda", + what1=True, + ever2=False, + )(foo) head = "The following target options are not supported:" msg = str(raises.exception) - self.assertEqual(msg[:len(head)], head) - items = msg[len(head):].strip().split(',') + self.assertEqual(msg[: len(head)], head) + items = msg[len(head) :].strip().split(",") items = [i.strip("'\" ") for i in items] - self.assertEqual(set(['what1', 'ever2']), set(items)) + self.assertEqual(set(["what1", "ever2"]), set(items)) def test_duplicated_output(self): - @guvectorize([void(float32[:], float32[:])], '(x)->(x)', target='cuda') + @guvectorize([void(float32[:], float32[:])], "(x)->(x)", target="cuda") def foo(inp, out): pass # intentionally empty; never executed @@ -284,8 +296,9 @@ def foo(inp, out): self.assertEqual(str(raises.exception), msg) def check_tuple_arg(self, a, b): - @guvectorize([(float64[:], float64[:], 
float64[:])], '(n),(n)->()', - target='cuda') + @guvectorize( + [(float64[:], float64[:], float64[:])], "(n),(n)->()", target="cuda" + ) def gu_reduce(x, y, r): s = 0 for i in range(len(x)): @@ -297,44 +310,40 @@ def gu_reduce(x, y, r): np.testing.assert_equal(expected, r) def test_tuple_of_tuple_arg(self): - a = ((1.0, 2.0, 3.0), - (4.0, 5.0, 6.0)) - b = ((1.5, 2.5, 3.5), - (4.5, 5.5, 6.5)) + a = ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)) + b = ((1.5, 2.5, 3.5), (4.5, 5.5, 6.5)) self.check_tuple_arg(a, b) def test_tuple_of_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) - a = (Point(x=1.0, y=2.0, z=3.0), - Point(x=4.0, y=5.0, z=6.0)) - b = (Point(x=1.5, y=2.5, z=3.5), - Point(x=4.5, y=5.5, z=6.5)) + Point = namedtuple("Point", ("x", "y", "z")) + a = (Point(x=1.0, y=2.0, z=3.0), Point(x=4.0, y=5.0, z=6.0)) + b = (Point(x=1.5, y=2.5, z=3.5), Point(x=4.5, y=5.5, z=6.5)) self.check_tuple_arg(a, b) def test_tuple_of_array_arg(self): - a = (np.asarray((1.0, 2.0, 3.0)), - np.asarray((4.0, 5.0, 6.0))) - b = (np.asarray((1.5, 2.5, 3.5)), - np.asarray((4.5, 5.5, 6.5))) + a = (np.asarray((1.0, 2.0, 3.0)), np.asarray((4.0, 5.0, 6.0))) + b = (np.asarray((1.5, 2.5, 3.5)), np.asarray((4.5, 5.5, 6.5))) self.check_tuple_arg(a, b) def test_gufunc_name(self): gufunc = _get_matmulcore_gufunc() - self.assertEqual(gufunc.__name__, 'matmulcore') + self.assertEqual(gufunc.__name__, "matmulcore") def test_bad_return_type(self): with self.assertRaises(TypeError) as te: - @guvectorize([int32(int32[:], int32[:])], '(m)->(m)', target='cuda') + + @guvectorize([int32(int32[:], int32[:])], "(m)->(m)", target="cuda") def f(x, y): pass msg = str(te.exception) - self.assertIn('guvectorized functions cannot return values', msg) - self.assertIn('specifies int32 return type', msg) + self.assertIn("guvectorized functions cannot return values", msg) + self.assertIn("specifies int32 return type", msg) def test_incorrect_number_of_pos_args(self): - @guvectorize([(int32[:], int32[:], 
int32[:])], - '(m),(m)->(m)', target='cuda') + @guvectorize( + [(int32[:], int32[:], int32[:])], "(m),(m)->(m)", target="cuda" + ) def f(x, y, z): pass @@ -345,26 +354,28 @@ def f(x, y, z): f(arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 3 positional arguments', msg) - self.assertIn('Got 1 positional argument.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 3 positional arguments", msg) + self.assertIn("Got 1 positional argument.", msg) # Inputs and outputs, too many with self.assertRaises(TypeError) as te: f(arr, arr, arr, arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 3 positional arguments', msg) - self.assertIn('Got 4 positional arguments.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 3 positional arguments", msg) + self.assertIn("Got 4 positional arguments.", msg) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestMultipleOutputs(CUDATestCase): def test_multiple_outputs_same_type_passed_in(self): - @guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy(A, B, C): for i in range(B.size): B[i] = A[i] @@ -378,10 +389,11 @@ def copy(A, B, C): np.testing.assert_allclose(A, C) def test_multiple_outputs_distinct_values(self): - - @guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_double(A, B, C): for i in range(B.size): B[i] = A[i] @@ -395,9 +407,11 @@ def copy_and_double(A, B, C): np.testing.assert_allclose(A * 2, C) def test_multiple_output_allocation(self): - 
@guvectorize([void(float32[:], float32[:], float32[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(float32[:], float32[:], float32[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_double(A, B, C): for i in range(B.size): B[i] = A[i] @@ -409,10 +423,11 @@ def copy_and_double(A, B, C): np.testing.assert_allclose(A * 2, C) def test_multiple_output_dtypes(self): - - @guvectorize([void(int32[:], int32[:], float64[:])], - '(x)->(x),(x)', - target='cuda') + @guvectorize( + [void(int32[:], int32[:], float64[:])], + "(x)->(x),(x)", + target="cuda", + ) def copy_and_multiply(A, B, C): for i in range(B.size): B[i] = A[i] @@ -426,8 +441,11 @@ def copy_and_multiply(A, B, C): np.testing.assert_allclose(A * np.float64(1.5), C) def test_incorrect_number_of_pos_args(self): - @guvectorize([(int32[:], int32[:], int32[:], int32[:])], - '(m),(m)->(m),(m)', target='cuda') + @guvectorize( + [(int32[:], int32[:], int32[:], int32[:])], + "(m),(m)->(m),(m)", + target="cuda", + ) def f(x, y, z, w): pass @@ -438,19 +456,19 @@ def f(x, y, z, w): f(arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 4 positional arguments', msg) - self.assertIn('Got 1 positional argument.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 4 positional arguments", msg) + self.assertIn("Got 1 positional argument.", msg) # Inputs and outputs, too many with self.assertRaises(TypeError) as te: f(arr, arr, arr, arr, arr) msg = str(te.exception) - self.assertIn('gufunc accepts 2 positional arguments', msg) - self.assertIn('or 4 positional arguments', msg) - self.assertIn('Got 5 positional arguments.', msg) + self.assertIn("gufunc accepts 2 positional arguments", msg) + self.assertIn("or 4 positional arguments", msg) + self.assertIn("Got 5 positional arguments.", msg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py index 493a9ceec..6b9940805 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py @@ -3,13 +3,14 @@ See Numpy documentation for detail about gufunc: http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html """ + import numpy as np from numba import guvectorize, cuda from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestGUFuncScalar(CUDATestCase): def test_gufunc_scalar_output(self): # function type: @@ -20,9 +21,9 @@ def test_gufunc_scalar_output(self): # signature: (n)->() # - the function takes an array of n-element and output a scalar. - @guvectorize(['void(int32[:], int32[:])'], '(n)->()', target='cuda') + @guvectorize(["void(int32[:], int32[:])"], "(n)->()", target="cuda") def sum_row(inp, out): - tmp = 0. 
+ tmp = 0.0 for i in range(inp.shape[0]): tmp += inp[i] out[0] = tmp @@ -38,15 +39,14 @@ def sum_row(inp, out): out1 = np.empty(100, dtype=inp.dtype) out2 = np.empty(100, dtype=inp.dtype) - dev_inp = cuda.to_device( - inp) # alloc and copy input data - dev_out1 = cuda.to_device(out1, copy=False) # alloc only + dev_inp = cuda.to_device(inp) # alloc and copy input data + dev_out1 = cuda.to_device(out1, copy=False) # alloc only - sum_row(dev_inp, out=dev_out1) # invoke the gufunc - dev_out2 = sum_row(dev_inp) # invoke the gufunc + sum_row(dev_inp, out=dev_out1) # invoke the gufunc + dev_out2 = sum_row(dev_inp) # invoke the gufunc - dev_out1.copy_to_host(out1) # retrieve the result - dev_out2.copy_to_host(out2) # retrieve the result + dev_out1.copy_to_host(out1) # retrieve the result + dev_out2.copy_to_host(out2) # retrieve the result # verify result for i in range(inp.shape[0]): @@ -55,7 +55,7 @@ def sum_row(inp, out): def test_gufunc_scalar_output_bug(self): # Issue 2812: Error due to using input argument types as output argument - @guvectorize(['void(int32, int32[:])'], '()->()', target='cuda') + @guvectorize(["void(int32, int32[:])"], "()->()", target="cuda") def twice(inp, out): out[0] = inp * 2 @@ -64,8 +64,11 @@ def twice(inp, out): self.assertPreciseEqual(twice(arg), arg * 2) def test_gufunc_scalar_input_saxpy(self): - @guvectorize(['void(float32, float32[:], float32[:], float32[:])'], - '(),(t),(t)->(t)', target='cuda') + @guvectorize( + ["void(float32, float32[:], float32[:], float32[:])"], + "(),(t),(t)->(t)", + target="cuda", + ) def saxpy(a, x, y, out): for i in range(out.shape[0]): out[i] = a * x[i] + y[i] @@ -99,8 +102,9 @@ def saxpy(a, x, y, out): self.assertTrue(exp == out[j, i], (exp, out[j, i])) def test_gufunc_scalar_cast(self): - @guvectorize(['void(int32, int32[:], int32[:])'], '(),(t)->(t)', - target='cuda') + @guvectorize( + ["void(int32, int32[:], int32[:])"], "(),(t)->(t)", target="cuda" + ) def foo(a, b, out): for i in range(b.size): out[i] = 
a * b[i] @@ -121,8 +125,9 @@ def foo(a, b, out): def test_gufunc_old_style_scalar_as_array(self): # Example from issue #2579 - @guvectorize(['void(int32[:],int32[:],int32[:])'], '(n),()->(n)', - target='cuda') + @guvectorize( + ["void(int32[:],int32[:],int32[:])"], "(n),()->(n)", target="cuda" + ) def gufunc(x, y, res): for i in range(x.shape[0]): res[i] = x[i] + y[0] @@ -155,5 +160,5 @@ def gufunc(x, y, res): np.testing.assert_almost_equal(expected, res) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py index fb8de3285..3c04a978e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py @@ -8,88 +8,82 @@ def template(signature, shapes, expects): for k, v in expects.items(): got = getattr(sch, k) if got != v: - fmt = 'error for %s: got=%s but expect=%s' + fmt = "error for %s: got=%s but expect=%s" raise AssertionError(fmt % (k, got, v)) class TestGUFuncScheduling(unittest.TestCase): def test_signature_1(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (100, 4, 5), (1, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(100,), - pinned=[False, True] + pinned=[False, True], ) template(signature, shapes, expects) def test_signature_2(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (100, 4, 5), (100, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(100,), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_3(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (12, 34, 4, 5), (12, 34, 5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(12, 34), - pinned=[False, 
False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_4(self): - signature = '(m, n), (n, p) -> (m, p)' + signature = "(m, n), (n, p) -> (m, p)" shapes = (4, 5), (5, 7) expects = dict( ishapes=[(4, 5), (5, 7)], oshapes=[(4, 7)], loopdims=(), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_5(self): - signature = '(a), (a) -> (a)' + signature = "(a), (a) -> (a)" shapes = (5,), (5,) expects = dict( ishapes=[(5,), (5,)], oshapes=[(5,)], loopdims=(), - pinned=[False, False] + pinned=[False, False], ) template(signature, shapes, expects) def test_signature_6(self): - signature = '(), () -> ()' + signature = "(), () -> ()" shapes = (5,), (5,) expects = dict( - ishapes=[(), ()], - oshapes=[()], - loopdims=(5,), - pinned=[False, False] + ishapes=[(), ()], oshapes=[()], loopdims=(5,), pinned=[False, False] ) template(signature, shapes, expects) def test_signature_7(self): - signature = '(), () -> ()' + signature = "(), () -> ()" shapes = (5,), () expects = dict( - ishapes=[(), ()], - oshapes=[()], - loopdims=(5,), - pinned=[False, True] + ishapes=[(), ()], oshapes=[()], loopdims=(5,), pinned=[False, True] ) template(signature, shapes, expects) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py b/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py index 44b770f42..332a87718 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py @@ -5,7 +5,6 @@ class TestCudaIDiv(CUDATestCase): def test_inplace_div(self): - @cuda.jit(void(float32[:, :], int32, int32)) def div(grid, l_x, l_y): for x in range(l_x): @@ -19,7 +18,6 @@ def div(grid, l_x, l_y): self.assertTrue(np.all(y == 0.5)) def test_inplace_div_double(self): - @cuda.jit(void(float64[:, :], int32, int32)) def div_double(grid, l_x, l_y): for x in range(l_x): @@ -33,5 +31,5 @@ def div_double(grid, l_x, 
l_y): self.assertTrue(np.all(y == 0.5)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py b/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py index 5c122dbd9..5a038a11c 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py @@ -3,11 +3,14 @@ from io import StringIO from numba import cuda, float32, float64, int32, intp from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import (skip_on_cudasim, skip_with_nvdisasm, - skip_without_nvdisasm) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_with_nvdisasm, + skip_without_nvdisasm, +) -@skip_on_cudasim('Simulator does not generate code to be inspected') +@skip_on_cudasim("Simulator does not generate code to be inspected") class TestInspect(CUDATestCase): @property def cc(self): @@ -60,7 +63,10 @@ def foo(x, y): # Signature in LLVM dict llvmirs = foo.inspect_llvm() - self.assertEqual(2, len(llvmirs), ) + self.assertEqual( + 2, + len(llvmirs), + ) self.assertIn((intp, intp), llvmirs) self.assertIn((float64, float64), llvmirs) @@ -75,7 +81,10 @@ def foo(x, y): asmdict = foo.inspect_asm() # Signature in assembly dict - self.assertEqual(2, len(asmdict), ) + self.assertEqual( + 2, + len(asmdict), + ) self.assertIn((intp, intp), asmdict) self.assertIn((float64, float64), asmdict) @@ -87,7 +96,7 @@ def _test_inspect_sass(self, kernel, name, sass): # Ensure function appears in output seen_function = False for line in sass.split(): - if '.text' in line and name in line: + if ".text" in line and name in line: seen_function = True self.assertTrue(seen_function) @@ -95,11 +104,11 @@ def _test_inspect_sass(self, kernel, name, sass): # Some instructions common to all supported architectures that should # appear in the output - self.assertIn('S2R', sass) # Special register to register - self.assertIn('BRA', sass) # Branch - self.assertIn('EXIT', 
sass) # Exit program + self.assertIn("S2R", sass) # Special register to register + self.assertIn("BRA", sass) # Branch + self.assertIn("EXIT", sass) # Exit program - @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass()") def test_inspect_sass_eager(self): sig = (float32[::1], int32[::1]) @@ -109,9 +118,9 @@ def add(x, y): if i < len(x): x[i] += y[i] - self._test_inspect_sass(add, 'add', add.inspect_sass(sig)) + self._test_inspect_sass(add, "add", add.inspect_sass(sig)) - @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass()") def test_inspect_sass_lazy(self): @cuda.jit(lineinfo=True) def add(x, y): @@ -124,10 +133,11 @@ def add(x, y): add[1, 10](x, y) signature = (int32[::1], float32[::1]) - self._test_inspect_sass(add, 'add', add.inspect_sass(signature)) + self._test_inspect_sass(add, "add", add.inspect_sass(signature)) - @skip_with_nvdisasm('Missing nvdisasm exception only generated when it is ' - 'not present') + @skip_with_nvdisasm( + "Missing nvdisasm exception only generated when it is not present" + ) def test_inspect_sass_nvdisasm_missing(self): @cuda.jit((float32[::1],)) def f(x): @@ -136,9 +146,9 @@ def f(x): with self.assertRaises(RuntimeError) as raises: f.inspect_sass() - self.assertIn('nvdisasm has not been found', str(raises.exception)) + self.assertIn("nvdisasm has not been found", str(raises.exception)) - @skip_without_nvdisasm('nvdisasm needed for inspect_sass_cfg()') + @skip_without_nvdisasm("nvdisasm needed for inspect_sass_cfg()") def test_inspect_sass_cfg(self): sig = (float32[::1], int32[::1]) @@ -149,10 +159,9 @@ def add(x, y): x[i] += y[i] self.assertRegex( - add.inspect_sass_cfg(signature=sig), - r'digraph\s*\w\s*{(.|\n)*\n}' + add.inspect_sass_cfg(signature=sig), r"digraph\s*\w\s*{(.|\n)*\n}" ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py b/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py index 5622789f7..6e4fa61e3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py @@ -6,8 +6,12 @@ from numba.cuda import compile_ptx from numba.core.errors import TypingError from numba.core.types import f2 -from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim, - skip_unless_cc_53) +from numba.cuda.testing import ( + unittest, + CUDATestCase, + skip_on_cudasim, + skip_unless_cc_53, +) def simple_threadidx(ary): @@ -260,7 +264,6 @@ def simple_hsqrt(r, x): def simple_hrsqrt(r, x): - i = cuda.grid(1) if i < len(r): @@ -268,7 +271,7 @@ def simple_hrsqrt(r, x): def numpy_hrsqrt(x, dtype): - return x ** -0.5 + return x**-0.5 def simple_hceil(r, x): @@ -404,15 +407,15 @@ def f_contigous(): f_res = f_contigous() self.assertTrue(np.all(c_res == f_res)) - @skip_on_cudasim('Cudasim does not check types') + @skip_on_cudasim("Cudasim does not check types") def test_nonliteral_grid_error(self): - with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'): - cuda.jit('void(int32)')(nonliteral_grid) + with self.assertRaisesRegex(TypingError, "RequireLiteralValue"): + cuda.jit("void(int32)")(nonliteral_grid) - @skip_on_cudasim('Cudasim does not check types') + @skip_on_cudasim("Cudasim does not check types") def test_nonliteral_gridsize_error(self): - with self.assertRaisesRegex(TypingError, 'RequireLiteralValue'): - cuda.jit('void(int32)')(nonliteral_gridsize) + with self.assertRaisesRegex(TypingError, "RequireLiteralValue"): + cuda.jit("void(int32)")(nonliteral_gridsize) def test_simple_grid1d(self): compiled = cuda.jit("void(int32[::1])")(simple_grid1d) @@ -444,7 +447,7 @@ def test_simple_gridsize1d(self): compiled[nctaid, ntid](ary) self.assertEqual(ary[0], nctaid * ntid) - @skip_on_cudasim('Requires too many threads') + @skip_on_cudasim("Requires too many threads") def 
test_issue_9229(self): # Ensure that grid and grid size are correct - #9229 showed that they # overflowed an int32. @@ -469,7 +472,7 @@ def f(grid_error, gridsize_error): self.assertEqual(grid_error[0], 0) self.assertEqual(gridsize_error[0], 0) - @skip_on_cudasim('Tests PTX emission') + @skip_on_cudasim("Tests PTX emission") def test_selp(self): sig = (int64[:], int64, int64[:]) cu_branching_with_ifs = cuda.jit(sig)(branching_with_ifs) @@ -485,14 +488,14 @@ def test_selp(self): a = np.arange(n, dtype=np.int64) cu_branching_with_ifs[n, 1](a, b, c) ptx = cu_branching_with_ifs.inspect_asm(sig) - self.assertEqual(2, len(re.findall(r'\s+bra\s+', ptx))) - np.testing.assert_array_equal(a, expected, err_msg='branching') + self.assertEqual(2, len(re.findall(r"\s+bra\s+", ptx))) + np.testing.assert_array_equal(a, expected, err_msg="branching") a = np.arange(n, dtype=np.int64) cu_branching_with_selps[n, 1](a, b, c) ptx = cu_branching_with_selps.inspect_asm(sig) - self.assertEqual(0, len(re.findall(r'\s+bra\s+', ptx))) - np.testing.assert_array_equal(a, expected, err_msg='selp') + self.assertEqual(0, len(re.findall(r"\s+bra\s+", ptx))) + np.testing.assert_array_equal(a, expected, err_msg="selp") def test_simple_gridsize2d(self): compiled = cuda.jit("void(int32[::1])")(simple_gridsize2d) @@ -528,10 +531,10 @@ def foo(out): a, b, c = cuda.gridsize(3) out[x, y, z] = a * b * c - arr = np.zeros(9 ** 3, dtype=np.int32).reshape(9, 9, 9) + arr = np.zeros(9**3, dtype=np.int32).reshape(9, 9, 9) foo[(3, 3, 3), (3, 3, 3)](arr) - np.testing.assert_equal(arr, 9 ** 3) + np.testing.assert_equal(arr, 9**3) def test_3dgrid_2(self): @cuda.jit @@ -539,13 +542,15 @@ def foo(out): x, y, z = cuda.grid(3) a, b, c = cuda.gridsize(3) grid_is_right = ( - x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and - y == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and - z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z + x == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x + and y 
== cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y + and z == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z + ) + gridsize_is_right = ( + a == cuda.blockDim.x * cuda.gridDim.x + and b == cuda.blockDim.y * cuda.gridDim.y + and c == cuda.blockDim.z * cuda.gridDim.z ) - gridsize_is_right = (a == cuda.blockDim.x * cuda.gridDim.x and - b == cuda.blockDim.y * cuda.gridDim.y and - c == cuda.blockDim.z * cuda.gridDim.z) out[x, y, z] = grid_is_right and gridsize_is_right x, y, z = (4 * 3, 3 * 2, 2 * 4) @@ -605,21 +610,21 @@ def test_bit_count_u8(self): def test_fma_f4(self): compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma) ary = np.zeros(1, dtype=np.float32) - compiled[1, 1](ary, 2., 3., 4.) + compiled[1, 1](ary, 2.0, 3.0, 4.0) np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_fma_f8(self): compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma) ary = np.zeros(1, dtype=np.float64) - compiled[1, 1](ary, 2., 3., 4.) + compiled[1, 1](ary, 2.0, 3.0, 4.0) np.testing.assert_allclose(ary[0], 2 * 3 + 4) @skip_unless_cc_53 def test_hadd(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hadd) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 + arg2) @@ -628,24 +633,24 @@ def test_hadd_scalar(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hadd_scalar) ary = np.zeros(1, dtype=np.float16) arg1 = np.float16(3.1415926) - arg2 = np.float16(3.) 
+ arg2 = np.float16(3.0) compiled[1, 1](ary, arg1, arg2) ref = arg1 + arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hadd_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3)) - self.assertIn('add.f16', ptx) + self.assertIn("add.f16", ptx) @skip_unless_cc_53 def test_hfma(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:], f2[:])")(simple_hfma) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([2.], dtype=np.float16) - arg2 = np.array([3.], dtype=np.float16) - arg3 = np.array([4.], dtype=np.float16) + arg1 = np.array([2.0], dtype=np.float16) + arg2 = np.array([3.0], dtype=np.float16) + arg3 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2, arg3) np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3) @@ -653,25 +658,25 @@ def test_hfma(self): def test_hfma_scalar(self): compiled = cuda.jit("void(f2[:], f2, f2, f2)")(simple_hfma_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) 
+ arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) ref = arg1 * arg2 + arg3 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hfma_ptx(self): args = (f2[:], f2, f2, f2) ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3)) - self.assertIn('fma.rn.f16', ptx) + self.assertIn("fma.rn.f16", ptx) @skip_unless_cc_53 def test_hsub(self): compiled = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hsub) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 - arg2) @@ -685,18 +690,18 @@ def test_hsub_scalar(self): ref = arg1 - arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hsub_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3)) - self.assertIn('sub.f16', ptx) + self.assertIn("sub.f16", ptx) @skip_unless_cc_53 def test_hmul(self): compiled = cuda.jit()(simple_hmul) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) - arg2 = np.array([4.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) + arg2 = np.array([4.0], dtype=np.float16) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1 * arg2) @@ -710,11 +715,11 @@ def test_hmul_scalar(self): ref = arg1 * arg2 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hmul_ptx(self): args = (f2[:], f2, f2) ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3)) - 
self.assertIn('mul.f16', ptx) + self.assertIn("mul.f16", ptx) @skip_unless_cc_53 def test_hdiv_scalar(self): @@ -742,7 +747,7 @@ def test_hdiv(self): def test_hneg(self): compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([3.], dtype=np.float16) + arg1 = np.array([3.0], dtype=np.float16) compiled[1, 1](ary, arg1) np.testing.assert_allclose(ary[0], -arg1) @@ -755,17 +760,17 @@ def test_hneg_scalar(self): ref = -arg1 np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_hneg_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3)) - self.assertIn('neg.f16', ptx) + self.assertIn("neg.f16", ptx) @skip_unless_cc_53 def test_habs(self): compiled = cuda.jit()(simple_habs) ary = np.zeros(1, dtype=np.float16) - arg1 = np.array([-3.], dtype=np.float16) + arg1 = np.array([-3.0], dtype=np.float16) compiled[1, 1](ary, arg1) np.testing.assert_allclose(ary[0], abs(arg1)) @@ -778,25 +783,43 @@ def test_habs_scalar(self): ref = abs(arg1) np.testing.assert_allclose(ary[0], ref) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_habs_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3)) - self.assertIn('abs.f16', ptx) + self.assertIn("abs.f16", ptx) @skip_unless_cc_53 def test_fp16_intrinsics_common(self): - kernels = (simple_hsin, simple_hcos, - simple_hlog, simple_hlog2, simple_hlog10, - simple_hsqrt, simple_hceil, simple_hfloor, - simple_hrcp, simple_htrunc, simple_hrint, - simple_hrsqrt) + kernels = ( + simple_hsin, + simple_hcos, + simple_hlog, + simple_hlog2, + simple_hlog10, + simple_hsqrt, + simple_hceil, + simple_hfloor, + simple_hrcp, + simple_htrunc, + simple_hrint, + simple_hrsqrt, + ) exp_kernels = (simple_hexp, simple_hexp2) - expected_functions = 
(np.sin, np.cos, - np.log, np.log2, np.log10, - np.sqrt, np.ceil, np.floor, - np.reciprocal, np.trunc, np.rint, - numpy_hrsqrt) + expected_functions = ( + np.sin, + np.cos, + np.log, + np.log2, + np.log10, + np.sqrt, + np.ceil, + np.floor, + np.reciprocal, + np.trunc, + np.rint, + numpy_hrsqrt, + ) expected_exp_functions = (np.exp, np.exp2) # Generate random data @@ -807,7 +830,7 @@ def test_fp16_intrinsics_common(self): for kernel, fn in zip(kernels, expected_functions): with self.subTest(fn=fn): kernel = cuda.jit("void(f2[:], f2[:])")(kernel) - kernel[1,N](r, x) + kernel[1, N](r, x) expected = fn(x, dtype=np.float16) np.testing.assert_allclose(r, expected) @@ -815,7 +838,7 @@ def test_fp16_intrinsics_common(self): for kernel, fn in zip(exp_kernels, expected_exp_functions): with self.subTest(fn=fn): kernel = cuda.jit("void(f2[:], f2[:])")(kernel) - kernel[1,N](r, x2) + kernel[1, N](r, x2) expected = fn(x2, dtype=np.float16) np.testing.assert_allclose(r, expected) @@ -836,14 +859,26 @@ def hexp10_vectors(r, x): # Run the kernel hexp10_vectors[1, N](r, x) - np.testing.assert_allclose(r, 10 ** x) + np.testing.assert_allclose(r, 10**x) @skip_unless_cc_53 def test_fp16_comparison(self): - fns = (simple_heq_scalar, simple_hne_scalar, simple_hge_scalar, - simple_hgt_scalar, simple_hle_scalar, simple_hlt_scalar) - ops = (operator.eq, operator.ne, operator.ge, - operator.gt, operator.le, operator.lt) + fns = ( + simple_heq_scalar, + simple_hne_scalar, + simple_hge_scalar, + simple_hgt_scalar, + simple_hle_scalar, + simple_hlt_scalar, + ) + ops = ( + operator.eq, + operator.ne, + operator.ge, + operator.gt, + operator.le, + operator.lt, + ) for fn, op in zip(fns, ops): with self.subTest(op=op): @@ -872,18 +907,20 @@ def test_fp16_comparison(self): @skip_unless_cc_53 def test_multiple_float16_comparisons(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + 
test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) + arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertTrue(ary[0]) @@ -891,11 +928,11 @@ def test_multiple_float16_comparisons(self): def test_hmax(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmax_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(3.) - arg2 = np.float16(4.) + arg1 = np.float16(3.0) + arg2 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg2) - arg1 = np.float16(5.) + arg1 = np.float16(5.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1) @@ -903,25 +940,25 @@ def test_hmax(self): def test_hmin(self): compiled = cuda.jit("void(f2[:], f2, f2)")(simple_hmin_scalar) ary = np.zeros(1, dtype=np.float16) - arg1 = np.float16(3.) - arg2 = np.float16(4.) + arg1 = np.float16(3.0) + arg2 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg1) - arg1 = np.float16(5.) + arg1 = np.float16(5.0) compiled[1, 1](ary, arg1, arg2) np.testing.assert_allclose(ary[0], arg2) def test_cbrt_f32(self): compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt) ary = np.zeros(1, dtype=np.float32) - cbrt_arg = 2. + cbrt_arg = 2.0 compiled[1, 1](ary, cbrt_arg) np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3)) def test_cbrt_f64(self): compiled = cuda.jit("void(float64[:], float64)")(simple_cbrt) ary = np.zeros(1, dtype=np.float64) - cbrt_arg = 6. 
+ cbrt_arg = 6.0 compiled[1, 1](ary, cbrt_arg) np.testing.assert_allclose(ary[0], cbrt_arg ** (1 / 3)) @@ -1052,25 +1089,36 @@ def test_round_to_f4(self): np.concatenate((vals, np.array([np.inf, -np.inf, np.nan]))) digits = ( # Common case branch of round_to_impl - -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + -5, + -4, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + 4, + 5, # The algorithm currently implemented can only round to 13 digits # with single precision. Note that this doesn't trigger the # "overflow safe" branch of the implementation, which can only be # hit when using double precision. - 13 + 13, ) for val, ndigits in itertools.product(vals, digits): with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='single') + self.assertPreciseEqual( + ary[0], round(val, ndigits), prec="single" + ) # CPython on most platforms uses rounding based on dtoa.c, whereas the CUDA # round-to implementation uses CPython's fallback implementation, which has # slightly different behavior at the edges of the domain. Since the CUDA # simulator executes using CPython, we need to skip this test when the # simulator is active. - @skip_on_cudasim('Overflow behavior differs on CPython') + @skip_on_cudasim("Overflow behavior differs on CPython") def test_round_to_f4_overflow(self): # Test that the input value is returned when y in round_ndigits # overflows. 
@@ -1092,7 +1140,7 @@ def test_round_to_f4_halfway(self): val = 0.3425 ndigits = 3 compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), prec='single') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="single") def test_round_to_f8(self): compiled = cuda.jit("void(float64[:], float64, int32)")(simple_round_to) @@ -1105,19 +1153,19 @@ def test_round_to_f8(self): for val, ndigits in itertools.product(vals, digits): with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='exact') + self.assertPreciseEqual( + ary[0], round(val, ndigits), prec="exact" + ) # Trigger the "overflow safe" branch of the implementation val = 0.12345678987654321 * 10e-15 ndigits = 23 with self.subTest(val=val, ndigits=ndigits): compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), - prec='double') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double") # Skipped on cudasim for the same reasons as test_round_to_f4 above. - @skip_on_cudasim('Overflow behavior differs on CPython') + @skip_on_cudasim("Overflow behavior differs on CPython") def test_round_to_f8_overflow(self): # Test that the input value is returned when y in round_ndigits # overflows. 
@@ -1139,8 +1187,8 @@ def test_round_to_f8_halfway(self): val = 0.5425 ndigits = 3 compiled[1, 1](ary, val, ndigits) - self.assertPreciseEqual(ary[0], round(val, ndigits), prec='double') + self.assertPreciseEqual(ary[0], round(val, ndigits), prec="double") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py index 657e9a104..4a6083cd2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py @@ -7,9 +7,13 @@ from numba import cuda from numba.cuda.cudadrv import driver -from numba.cuda.testing import (skip_on_arm, skip_on_cudasim, - skip_under_cuda_memcheck, - ContextResettingTestCase, ForeignArray) +from numba.cuda.testing import ( + skip_on_arm, + skip_on_cudasim, + skip_under_cuda_memcheck, + ContextResettingTestCase, + ForeignArray, +) from numba.tests.support import linux_only, windows_only import unittest @@ -32,8 +36,9 @@ def core_ipc_handle_test(the_work, result_queue): def base_ipc_handle_test(handle, size, result_queue): def the_work(): dtype = np.dtype(np.intp) - with cuda.open_ipc_array(handle, shape=size // dtype.itemsize, - dtype=dtype) as darr: + with cuda.open_ipc_array( + handle, shape=size // dtype.itemsize, dtype=dtype + ) as darr: # copy the data to host return darr.copy_to_host() @@ -43,9 +48,11 @@ def the_work(): def serialize_ipc_handle_test(handle, result_queue): def the_work(): dtype = np.dtype(np.intp) - darr = handle.open_array(cuda.current_context(), - shape=handle.size // dtype.itemsize, - dtype=dtype) + darr = handle.open_array( + cuda.current_context(), + shape=handle.size // dtype.itemsize, + dtype=dtype, + ) # copy the data to host arr = darr.copy_to_host() handle.close() @@ -63,10 +70,10 @@ def ipc_array_test(ipcarr, result_queue): with ipcarr: pass except ValueError as e: - if str(e) != 'IpcHandle is already opened': - raise AssertionError('invalid 
exception message') + if str(e) != "IpcHandle is already opened": + raise AssertionError("invalid exception message") else: - raise AssertionError('did not raise on reopen') + raise AssertionError("did not raise on reopen") # Catch any exception so we can propagate it except: # noqa: E722 # FAILED. propagate the exception as a string @@ -80,11 +87,10 @@ def ipc_array_test(ipcarr, result_queue): @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Ipc not available in CUDASIM') -@skip_on_arm('CUDA IPC not supported on ARM in Numba') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Ipc not available in CUDASIM") +@skip_on_arm("CUDA IPC not supported on ARM in Numba") class TestIpcMemory(ContextResettingTestCase): - def test_ipc_handle(self): # prepare data for IPC arr = np.arange(10, dtype=np.intp) @@ -102,7 +108,7 @@ def test_ipc_handle(self): size = ipch.size # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (handle_bytes, size, result_queue) proc = ctx.Process(target=base_ipc_handle_test, args=args) @@ -145,11 +151,12 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): if driver.USE_NV_BINDING: self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: - self.assertEqual(ipch_recon.handle.reserved[:], - ipch.handle.reserved[:]) + self.assertEqual( + ipch_recon.handle.reserved[:], ipch.handle.reserved[:] + ) # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, result_queue) proc = ctx.Process(target=serialize_ipc_handle_test, args=args) @@ -162,7 +169,10 @@ def check_ipc_handle_serialization(self, index_arg=None, foreign=False): proc.join(3) def test_ipc_handle_serialization(self): - for index, foreign, in self.variants(): + for ( + index, + foreign, + ) in self.variants(): with self.subTest(index=index, 
foreign=foreign): self.check_ipc_handle_serialization(index, foreign) @@ -179,7 +189,7 @@ def check_ipc_array(self, index_arg=None, foreign=False): ipch = devarr.get_ipc_handle() # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, result_queue) proc = ctx.Process(target=ipc_array_test, args=args) @@ -192,7 +202,10 @@ def check_ipc_array(self, index_arg=None, foreign=False): proc.join(3) def test_ipc_array(self): - for index, foreign, in self.variants(): + for ( + index, + foreign, + ) in self.variants(): with self.subTest(index=index, foreign=foreign): self.check_ipc_array(index, foreign) @@ -205,7 +218,9 @@ def the_work(): arrsize = handle.size // np.dtype(np.intp).itemsize hostarray = np.zeros(arrsize, dtype=np.intp) cuda.driver.device_to_host( - hostarray, deviceptr, size=handle.size, + hostarray, + deviceptr, + size=handle.size, ) handle.close() return hostarray @@ -223,10 +238,10 @@ def staged_ipc_array_test(ipcarr, device_num, result_queue): with ipcarr: pass except ValueError as e: - if str(e) != 'IpcHandle is already opened': - raise AssertionError('invalid exception message') + if str(e) != "IpcHandle is already opened": + raise AssertionError("invalid exception message") else: - raise AssertionError('did not raise on reopen') + raise AssertionError("did not raise on reopen") # Catch any exception so we can propagate it except: # noqa: E722 # FAILED. 
propagate the exception as a string @@ -240,9 +255,9 @@ def staged_ipc_array_test(ipcarr, device_num, result_queue): @linux_only -@skip_under_cuda_memcheck('Hangs cuda-memcheck') -@skip_on_cudasim('Ipc not available in CUDASIM') -@skip_on_arm('CUDA IPC not supported on ARM in Numba') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("Ipc not available in CUDASIM") +@skip_on_arm("CUDA IPC not supported on ARM in Numba") class TestIpcStaged(ContextResettingTestCase): def test_staged(self): # prepare data for IPC @@ -250,7 +265,7 @@ def test_staged(self): devarr = cuda.to_device(arr) # spawn new process for testing - mpctx = mp.get_context('spawn') + mpctx = mp.get_context("spawn") result_queue = mpctx.Queue() # create IPC handle @@ -264,8 +279,7 @@ def test_staged(self): self.assertEqual(ipch_recon.handle.reserved, ipch.handle.reserved) else: self.assertEqual( - ipch_recon.handle.reserved[:], - ipch.handle.reserved[:] + ipch_recon.handle.reserved[:], ipch.handle.reserved[:] ) self.assertEqual(ipch_recon.size, ipch.size) @@ -289,7 +303,7 @@ def test_ipc_array(self): ipch = devarr.get_ipc_handle() # spawn new process for testing - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") result_queue = ctx.Queue() args = (ipch, device_num, result_queue) proc = ctx.Process(target=staged_ipc_array_test, args=args) @@ -303,7 +317,7 @@ def test_ipc_array(self): @windows_only -@skip_on_cudasim('Ipc not available in CUDASIM') +@skip_on_cudasim("Ipc not available in CUDASIM") class TestIpcNotSupported(ContextResettingTestCase): def test_unsupported(self): arr = np.arange(10, dtype=np.intp) @@ -311,8 +325,8 @@ def test_unsupported(self): with self.assertRaises(OSError) as raises: devarr.get_ipc_handle() errmsg = str(raises.exception) - self.assertIn('OS does not support CUDA IPC', errmsg) + self.assertIn("OS does not support CUDA IPC", errmsg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py b/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py index 47366f380..4a69badc2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py @@ -5,7 +5,6 @@ class TestIterators(CUDATestCase): - def test_enumerate(self): @cuda.jit def enumerator(x, error): @@ -95,5 +94,5 @@ def zipper_enumerator(x, y, error): self._test_twoarg_function(zipper_enumerator) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_lang.py b/numba_cuda/numba/cuda/tests/cudapy/test_lang.py index 0241c1e40..97562c250 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_lang.py @@ -10,7 +10,7 @@ class TestLang(CUDATestCase): def test_enumerate(self): - tup = (1., 2.5, 3.) + tup = (1.0, 2.5, 3.0) @cuda.jit("void(float64[:])") def foo(a): @@ -39,12 +39,12 @@ def foo(a): self.assertTrue(np.all(a == (b + c).sum())) def test_issue_872(self): - ''' + """ Ensure that typing and lowering of CUDA kernel API primitives works in more than one block. Was originally to ensure that macro expansion works for more than one block (issue #872), but macro expansion has been replaced by a "proper" implementation of all kernel API functions. 
- ''' + """ @cuda.jit("void(float64[:,:])") def cuda_kernel_api_in_multiple_blocks(ary): @@ -60,5 +60,5 @@ def cuda_kernel_api_in_multiple_blocks(ary): cuda_kernel_api_in_multiple_blocks[1, (2, 3)](a) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py b/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py index d868b0297..3a1dee8b0 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py @@ -14,7 +14,6 @@ class TestCudaLaplace(CUDATestCase): def test_laplace_small(self): - @cuda.jit(float64(float64, float64), device=True, inline=True) def get_max(a, b): if a > b: @@ -38,8 +37,9 @@ def jocabi_relax_core(A, Anew, error): err_sm[ty, tx] = 0 if j >= 1 and j < n - 1 and i >= 1 and i < m - 1: - Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] - + A[j - 1, i] + A[j + 1, i]) + Anew[j, i] = 0.25 * ( + A[j, i + 1] + A[j, i - 1] + A[j - 1, i] + A[j + 1, i] + ) err_sm[ty, tx] = Anew[j, i] - A[j, i] cuda.syncthreads() @@ -91,8 +91,8 @@ def jocabi_relax_core(A, Anew, error): stream = cuda.stream() - dA = cuda.to_device(A, stream) # to device and don't come back - dAnew = cuda.to_device(Anew, stream) # to device and don't come back + dA = cuda.to_device(A, stream) # to device and don't come back + dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: @@ -115,5 +115,5 @@ def jocabi_relax_core(A, Anew, error): iter += 1 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py b/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py index 9572a8882..d2b85e501 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py @@ -31,7 +31,7 @@ def use_sad(r, x, y, z): r[i] = libdevice.sad(x[i], y[i], 
z[i]) -@skip_on_cudasim('Libdevice functions are not supported on cudasim') +@skip_on_cudasim("Libdevice functions are not supported on cudasim") class TestLibdevice(CUDATestCase): """ Some tests of libdevice function wrappers that check the returned values. @@ -102,14 +102,15 @@ def make_test_call(libname): def _test_call_functions(self): # Strip off '__nv_' from libdevice name to get Python name apiname = libname[5:] - apifunc = getattr(libdevice, apiname) + apifunc = getattr(libdevice, apiname) # noqa: F841 retty, args = functions[libname] sig = create_signature(retty, args) # Construct arguments to the libdevice function. These are all # non-pointer arguments to the underlying bitcode function. - funcargs = ", ".join(['a%d' % i for i, arg in enumerate(args) if not - arg.is_ptr]) + funcargs = ", ".join( + ["a%d" % i for i, arg in enumerate(args) if not arg.is_ptr] + ) # Arguments to the Python function (`pyfunc` in the template above) are # the arguments to the libdevice function, plus as many extra arguments @@ -118,35 +119,37 @@ def _test_call_functions(self): # returns. if isinstance(sig.return_type, (types.Tuple, types.UniTuple)): # Start with the parameters for the return values - pyargs = ", ".join(['r%d' % i for i in - range(len(sig.return_type))]) + pyargs = ", ".join(["r%d" % i for i in range(len(sig.return_type))]) # Add the parameters for the argument values pyargs += ", " + funcargs # Generate the unpacking of the return value from the libdevice # function into the Python function return values (`r0`, `r1`, # etc.). 
- retvars = ", ".join(['r%d[0]' % i for i in - range(len(sig.return_type))]) + retvars = ", ".join( + ["r%d[0]" % i for i in range(len(sig.return_type))] + ) else: # Scalar return is a more straightforward case pyargs = "r0, " + funcargs retvars = "r0[0]" # Create the string containing the function to compile - d = { 'func': apiname, - 'pyargs': pyargs, - 'funcargs': funcargs, - 'retvars': retvars } + d = { + "func": apiname, + "pyargs": pyargs, + "funcargs": funcargs, + "retvars": retvars, + } code = function_template % d # Convert the string to a Python function locals = {} exec(code, globals(), locals) - pyfunc = locals['pyfunc'] + pyfunc = locals["pyfunc"] # Compute the signature for compilation. This mirrors the creation of # arguments to the Python function above. - pyargs = [ arg.ty for arg in args if not arg.is_ptr ] + pyargs = [arg.ty for arg in args if not arg.is_ptr] if isinstance(sig.return_type, (types.Tuple, types.UniTuple)): pyreturns = [ret[::1] for ret in sig.return_type] pyargs = pyreturns + pyargs @@ -159,16 +162,16 @@ def _test_call_functions(self): # If the function body was discarded by optimization (therefore making # the test a bit weak), there won't be any loading of parameters - # ensure that a load from parameters occurs somewhere in the PTX - self.assertIn('ld.param', ptx) + self.assertIn("ld.param", ptx) # Returning the result (through a passed-in array) should also require # a store to global memory, so check for at least one of those too. - self.assertIn('st.global', ptx) + self.assertIn("st.global", ptx) return _test_call_functions -@skip_on_cudasim('Compilation to PTX is not supported on cudasim') +@skip_on_cudasim("Compilation to PTX is not supported on cudasim") class TestLibdeviceCompilation(unittest.TestCase): """ Class for holding all tests of compiling calls to libdevice functions. 
We @@ -179,9 +182,10 @@ class TestLibdeviceCompilation(unittest.TestCase): for libname in functions: - setattr(TestLibdeviceCompilation, 'test_%s' % libname, - make_test_call(libname)) + setattr( + TestLibdeviceCompilation, "test_%s" % libname, make_test_call(libname) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py b/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py index 182873b50..edd7c314d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py @@ -7,16 +7,16 @@ import warnings -@skip_on_cudasim('Simulator does not produce lineinfo') +@skip_on_cudasim("Simulator does not produce lineinfo") class TestCudaLineInfo(CUDATestCase): def _loc_directive_regex(self): # This is used in several tests pat = ( - r'\.loc' # .loc directive beginning - r'\s+[0-9]+' # whitespace then file index - r'\s+[0-9]+' # whitespace then line number - r'\s+[0-9]+' # whitespace then column position + r"\.loc" # .loc directive beginning + r"\s+[0-9]+" # whitespace then file index + r"\s+[0-9]+" # whitespace then line number + r"\s+[0-9]+" # whitespace then column position ) return re.compile(pat) @@ -29,21 +29,21 @@ def _check(self, fn, sig, expect): # DICompileUnit debug info metadata should all be of the # DebugDirectivesOnly kind, and not the FullDebug kind pat = ( - r'!DICompileUnit\(.*' # Opening of DICompileUnit metadata. Since - # the order of attributes is not - # guaranteed, we need to match arbitrarily - # afterwards. - r'emissionKind:\s+' # The emissionKind attribute followed by - # whitespace. - r'DebugDirectivesOnly' # The correct emissionKind. + r"!DICompileUnit\(.*" # Opening of DICompileUnit metadata. Since + # the order of attributes is not + # guaranteed, we need to match arbitrarily + # afterwards. + r"emissionKind:\s+" # The emissionKind attribute followed by + # whitespace. 
+ r"DebugDirectivesOnly" # The correct emissionKind. ) match = re.compile(pat).search(llvm) assertfn(match, msg=ptx) pat = ( - r'!DICompileUnit\(.*' # Same as the pattern above, but for the - r'emissionKind:\s+' # incorrect FullDebug emissionKind. - r'FullDebug' # + r"!DICompileUnit\(.*" # Same as the pattern above, but for the + r"emissionKind:\s+" # incorrect FullDebug emissionKind. + r"FullDebug" # ) match = re.compile(pat).search(llvm) self.assertIsNone(match, msg=ptx) @@ -51,8 +51,8 @@ def _check(self, fn, sig, expect): # The name of this file should be present in the line mapping # if lineinfo was propagated through correctly. pat = ( - r'\.file' # .file directive beginning - r'\s+[0-9]+\s+' # file number surrounded by whitespace + r"\.file" # .file directive beginning + r"\s+[0-9]+\s+" # file number surrounded by whitespace r'".*test_lineinfo.py"' # filename in quotes, ignoring full path ) match = re.compile(pat).search(ptx) @@ -65,8 +65,8 @@ def _check(self, fn, sig, expect): # Debug info sections should not be present when only lineinfo is # generated pat = ( - r'\.section\s+' # .section directive beginning - r'\.debug_info' # Section named ".debug_info" + r"\.section\s+" # .section directive beginning + r"\.debug_info" # Section named ".debug_info" ) match = re.compile(pat).search(ptx) self.assertIsNone(match, msg=ptx) @@ -98,7 +98,7 @@ def divide_kernel(x, y): # signal an exception (e.g. divide by zero) has occurred. When the # error model is the default NumPy one (as it should be when only # lineinfo is enabled) the device function always returns 0. - self.assertNotIn('ret i32 1', llvm) + self.assertNotIn("ret i32 1", llvm) def test_no_lineinfo_in_device_function(self): # Ensure that no lineinfo is generated in device functions by default. 
@@ -138,7 +138,7 @@ def caller(x): # Check that there is no device function in the PTX # A line beginning with ".weak .func" that identifies a device function - devfn_start = re.compile(r'^\.weak\s+\.func') + devfn_start = re.compile(r"^\.weak\s+\.func") for line in ptxlines: if devfn_start.match(line) is not None: @@ -151,13 +151,14 @@ def caller(x): for line in ptxlines: if loc_directive.search(line) is not None: - if 'inlined_at' in line: + if "inlined_at" in line: found = True break if not found: - self.fail(f'No .loc directive with inlined_at info found' - f'in:\n\n{ptx}') + self.fail( + f"No .loc directive with inlined_at info foundin:\n\n{ptx}" + ) # We also inspect the LLVM to ensure that there's debug info for each # subprogram (function). A lightweight way to check this is to ensure @@ -166,7 +167,7 @@ def caller(x): llvm = caller.inspect_llvm(sig) subprograms = 0 for line in llvm.splitlines(): - if 'distinct !DISubprogram' in line: + if "distinct !DISubprogram" in line: subprograms += 1 # One DISubprogram for each of: @@ -174,9 +175,12 @@ def caller(x): # - The callee expected_subprograms = 2 - self.assertEqual(subprograms, expected_subprograms, - f'"Expected {expected_subprograms} DISubprograms; ' - f'got {subprograms}') + self.assertEqual( + subprograms, + expected_subprograms, + f'"Expected {expected_subprograms} DISubprograms; ' + f"got {subprograms}", + ) def test_debug_and_lineinfo_warning(self): with warnings.catch_warnings(record=True) as w: @@ -190,9 +194,10 @@ def f(): self.assertEqual(len(w), 1) self.assertEqual(w[0].category, NumbaInvalidConfigWarning) - self.assertIn('debug and lineinfo are mutually exclusive', - str(w[0].message)) + self.assertIn( + "debug and lineinfo are mutually exclusive", str(w[0].message) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py b/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py index 26b3469a7..1e6687ce6 100644 --- 
a/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py @@ -31,32 +31,31 @@ def culocal1tuple(A, B): B[i] = C[i] -@skip_on_cudasim('PTX inspection not available in cudasim') +@skip_on_cudasim("PTX inspection not available in cudasim") class TestCudaLocalMem(CUDATestCase): def test_local_array(self): sig = (int32[:], int32[:]) jculocal = cuda.jit(sig)(culocal) - self.assertTrue('.local' in jculocal.inspect_asm(sig)) - A = np.arange(1000, dtype='int32') + self.assertTrue(".local" in jculocal.inspect_asm(sig)) + A = np.arange(1000, dtype="int32") B = np.zeros_like(A) jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_1_tuple(self): - """Ensure that local arrays can be constructed with 1-tuple shape - """ - jculocal = cuda.jit('void(int32[:], int32[:])')(culocal1tuple) + """Ensure that local arrays can be constructed with 1-tuple shape""" + jculocal = cuda.jit("void(int32[:], int32[:])")(culocal1tuple) # Don't check if .local is in the ptx because the optimizer # may reduce it to registers. 
- A = np.arange(5, dtype='int32') + A = np.arange(5, dtype="int32") B = np.zeros_like(A) jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_complex(self): - sig = 'void(complex128[:], complex128[:])' + sig = "void(complex128[:], complex128[:])" jculocalcomplex = cuda.jit(sig)(culocalcomplex) - A = (np.arange(100, dtype='complex128') - 1) / 2j + A = (np.arange(100, dtype="complex128") - 1) / 2j B = np.zeros_like(A) jculocalcomplex[1, 1](A, B) self.assertTrue(np.all(A == B)) @@ -64,7 +63,7 @@ def test_local_array_complex(self): def check_dtype(self, f, dtype): # Find the typing of the dtype argument to cuda.local.array annotation = next(iter(f.overloads.values()))._type_annotation - l_dtype = annotation.typemap['l'].dtype + l_dtype = annotation.typemap["l"].dtype # Ensure that the typing is correct self.assertEqual(l_dtype, dtype) @@ -95,7 +94,7 @@ def test_string_dtype(self): # Check that strings can be used to specify the dtype of a local array @cuda.jit(void(int32[::1])) def f(x): - l = cuda.local.array(10, dtype='int32') + l = cuda.local.array(10, dtype="int32") l[0] = x[0] x[0] = l[0] @@ -106,9 +105,10 @@ def test_invalid_string_dtype(self): # Check that strings of invalid dtypes cause a typing error re = ".*Invalid NumPy dtype specified: 'int33'.*" with self.assertRaisesRegex(TypingError, re): + @cuda.jit(void(int32[::1])) def f(x): - l = cuda.local.array(10, dtype='int33') + l = cuda.local.array(10, dtype="int33") l[0] = x[0] x[0] = l[0] @@ -160,5 +160,5 @@ def test_issue_fp16_support(self): self._check_local_array_size_fp16(2, 2, np.float16) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py b/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py index 2b7290ad4..29265bfa5 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py @@ -3,11 +3,10 @@ from numba.cuda.testing import skip_on_cudasim, 
unittest -@skip_on_cudasim('Compilation unsupported in the simulator') +@skip_on_cudasim("Compilation unsupported in the simulator") class TestCudaMandel(unittest.TestCase): def test_mandel(self): - """Just make sure we can compile this - """ + """Just make sure we can compile this""" def mandel(tid, min_x, max_x, min_y, max_y, width, height, iters): pixel_size_x = (max_x - min_x) / width @@ -28,10 +27,18 @@ def mandel(tid, min_x, max_x, min_y, max_y, width, height, iters): return i return iters - args = (uint32, float64, float64, float64, float64, - uint32, uint32, uint32) + args = ( + uint32, + float64, + float64, + float64, + float64, + uint32, + uint32, + uint32, + ) compile_ptx(mandel, args, device=True) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_math.py b/numba_cuda/numba/cuda/tests/cudapy/test_math.py index 028a402ff..9cc7ff473 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_math.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_math.py @@ -1,8 +1,10 @@ import numpy as np -from numba.cuda.testing import (skip_unless_cc_53, - unittest, - CUDATestCase, - skip_on_cudasim) +from numba.cuda.testing import ( + skip_unless_cc_53, + unittest, + CUDATestCase, + skip_on_cudasim, +) from numba.np import numpy_support from numba import cuda, float32, float64, int32, vectorize, void, int64 import math @@ -253,8 +255,10 @@ def unary_template(self, func, npfunc, npdtype, nprestype, start, stop): def unary_bool_special_values(self, func, npfunc, npdtype, npmtype): fi = np.finfo(npdtype) denorm = fi.tiny / 4 - A = np.array([0., denorm, fi.tiny, 0.5, 1., fi.max, np.inf, np.nan], - dtype=npdtype) + A = np.array( + [0.0, denorm, fi.tiny, 0.5, 1.0, fi.max, np.inf, np.nan], + dtype=npdtype, + ) B = np.empty_like(A, dtype=np.int32) cfunc = cuda.jit((npmtype[::1], int32[::1]))(func) @@ -314,7 +318,7 @@ def binary_template(self, func, npfunc, npdtype, nprestype, start, stop): cfunc[1, nelem](A, A, 
B) np.testing.assert_allclose(npfunc(A, A), B) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_acos def test_math_acos(self): @@ -325,7 +329,7 @@ def test_math_acos(self): self.unary_template_int64(math_acos, np.arccos, start=0, stop=0) self.unary_template_uint64(math_acos, np.arccos, start=0, stop=0) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_asin def test_math_asin(self): @@ -336,7 +340,7 @@ def test_math_asin(self): self.unary_template_int64(math_asin, np.arcsin, start=0, stop=0) self.unary_template_uint64(math_asin, np.arcsin, start=0, stop=0) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_atan def test_math_atan(self): @@ -345,7 +349,7 @@ def test_math_atan(self): self.unary_template_int64(math_atan, np.arctan) self.unary_template_uint64(math_atan, np.arctan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_acosh def test_math_acosh(self): @@ -354,7 +358,7 @@ def test_math_acosh(self): self.unary_template_int64(math_acosh, np.arccosh, start=1, stop=2) self.unary_template_uint64(math_acosh, np.arccosh, start=1, stop=2) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_asinh def test_math_asinh(self): @@ -363,16 +367,16 @@ def test_math_asinh(self): self.unary_template_int64(math_asinh, np.arcsinh) self.unary_template_uint64(math_asinh, np.arcsinh) - #--------------------------------------------------------------------------- + # 
--------------------------------------------------------------------------- # test_math_atanh def test_math_atanh(self): - self.unary_template_float32(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_float64(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_int64(math_atanh, np.arctanh, start=0, stop=.9) - self.unary_template_uint64(math_atanh, np.arctanh, start=0, stop=.9) + self.unary_template_float32(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_float64(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_int64(math_atanh, np.arctanh, start=0, stop=0.9) + self.unary_template_uint64(math_atanh, np.arctanh, start=0, stop=0.9) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_cos def test_math_cos(self): @@ -399,7 +403,7 @@ def test_math_fp16(self): def test_math_fp16_trunc(self): self.unary_template_float16(math_trunc, np.trunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sin def test_math_sin(self): @@ -408,7 +412,7 @@ def test_math_sin(self): self.unary_template_int64(math_sin, np.sin) self.unary_template_uint64(math_sin, np.sin) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_tan def test_math_tan(self): @@ -417,7 +421,7 @@ def test_math_tan(self): self.unary_template_int64(math_tan, np.tan) self.unary_template_uint64(math_tan, np.tan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_cosh def test_math_cosh(self): @@ -426,7 +430,7 @@ def test_math_cosh(self): self.unary_template_int64(math_cosh, np.cosh) 
self.unary_template_uint64(math_cosh, np.cosh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sinh def test_math_sinh(self): @@ -435,7 +439,7 @@ def test_math_sinh(self): self.unary_template_int64(math_sinh, np.sinh) self.unary_template_uint64(math_sinh, np.sinh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_tanh def test_math_tanh(self): @@ -444,7 +448,7 @@ def test_math_tanh(self): self.unary_template_int64(math_tanh, np.tanh) self.unary_template_uint64(math_tanh, np.tanh) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_atan2 def test_math_atan2(self): @@ -453,31 +457,33 @@ def test_math_atan2(self): self.binary_template_int64(math_atan2, np.arctan2) self.binary_template_uint64(math_atan2, np.arctan2) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_erf def test_math_erf(self): @vectorize def ufunc(x): return math.erf(x) + self.unary_template_float32(math_erf, ufunc) self.unary_template_float64(math_erf, ufunc) self.unary_template_int64(math_erf, ufunc) self.unary_template_uint64(math_erf, ufunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_erfc def test_math_erfc(self): @vectorize def ufunc(x): return math.erfc(x) + self.unary_template_float32(math_erfc, ufunc) self.unary_template_float64(math_erfc, ufunc) self.unary_template_int64(math_erfc, ufunc) self.unary_template_uint64(math_erfc, ufunc) - 
#--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_exp def test_math_exp(self): @@ -486,7 +492,7 @@ def test_math_exp(self): self.unary_template_int64(math_exp, np.exp) self.unary_template_uint64(math_exp, np.exp) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_expm1 def test_math_expm1(self): @@ -495,7 +501,7 @@ def test_math_expm1(self): self.unary_template_int64(math_expm1, np.expm1) self.unary_template_uint64(math_expm1, np.expm1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_fabs def test_math_fabs(self): @@ -504,31 +510,33 @@ def test_math_fabs(self): self.unary_template_int64(math_fabs, np.fabs, start=-1) self.unary_template_uint64(math_fabs, np.fabs, start=-1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_gamma def test_math_gamma(self): @vectorize def ufunc(x): return math.gamma(x) + self.unary_template_float32(math_gamma, ufunc, start=0.1) self.unary_template_float64(math_gamma, ufunc, start=0.1) self.unary_template_int64(math_gamma, ufunc, start=1) self.unary_template_uint64(math_gamma, ufunc, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_lgamma def test_math_lgamma(self): @vectorize def ufunc(x): return math.lgamma(x) + self.unary_template_float32(math_lgamma, ufunc, start=0.1) self.unary_template_float64(math_lgamma, ufunc, start=0.1) self.unary_template_int64(math_lgamma, ufunc, start=1) self.unary_template_uint64(math_lgamma, ufunc, 
start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log def test_math_log(self): @@ -537,7 +545,7 @@ def test_math_log(self): self.unary_template_int64(math_log, np.log, start=1) self.unary_template_uint64(math_log, np.log, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log2 def test_math_log2(self): @@ -546,7 +554,7 @@ def test_math_log2(self): self.unary_template_int64(math_log2, np.log2, start=1) self.unary_template_uint64(math_log2, np.log2, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log10 def test_math_log10(self): @@ -555,7 +563,7 @@ def test_math_log10(self): self.unary_template_int64(math_log10, np.log10, start=1) self.unary_template_uint64(math_log10, np.log10, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_log1p def test_math_log1p(self): @@ -564,7 +572,7 @@ def test_math_log1p(self): self.unary_template_int64(math_log1p, np.log1p) self.unary_template_uint64(math_log1p, np.log1p) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_remainder def test_math_remainder(self): @@ -573,16 +581,17 @@ def test_math_remainder(self): self.binary_template_int64(math_remainder, np.remainder, start=1) self.binary_template_uint64(math_remainder, np.remainder, start=1) - @skip_on_cudasim('math.remainder(0, 0) raises a ValueError on CUDASim') + @skip_on_cudasim("math.remainder(0, 0) raises a ValueError on CUDASim") def 
test_math_remainder_0_0(self): @cuda.jit(void(float64[::1], int64, int64)) def test_0_0(r, x, y): r[0] = math.remainder(x, y) + r = np.zeros(1, np.float64) test_0_0[1, 1](r, 0, 0) self.assertTrue(np.isnan(r[0])) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_sqrt def test_math_sqrt(self): @@ -591,7 +600,7 @@ def test_math_sqrt(self): self.unary_template_int64(math_sqrt, np.sqrt) self.unary_template_uint64(math_sqrt, np.sqrt) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_hypot def test_math_hypot(self): @@ -600,7 +609,7 @@ def test_math_hypot(self): self.binary_template_int64(math_hypot, np.hypot) self.binary_template_uint64(math_hypot, np.hypot) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_pow def pow_template_int32(self, npdtype): @@ -626,14 +635,14 @@ def test_math_pow(self): self.pow_template_int32(np.float32) self.pow_template_int32(np.float64) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_pow_binop def test_math_pow_binop(self): self.binary_template_float32(math_pow_binop, np.power) self.binary_template_float64(math_pow_binop, np.power) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_ceil def test_math_ceil(self): @@ -642,7 +651,7 @@ def test_math_ceil(self): self.unary_template_int64(math_ceil, np.ceil) self.unary_template_uint64(math_ceil, np.ceil) - #--------------------------------------------------------------------------- + # 
--------------------------------------------------------------------------- # test_math_floor def test_math_floor(self): @@ -651,7 +660,7 @@ def test_math_floor(self): self.unary_template_int64(math_floor, np.floor) self.unary_template_uint64(math_floor, np.floor) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_trunc # # Note that math.trunc() is only supported on NumPy float64s, and not @@ -663,20 +672,20 @@ def test_math_floor(self): def test_math_trunc(self): self.unary_template_float64(math_trunc, np.trunc) - @skip_on_cudasim('trunc only supported on NumPy float64') + @skip_on_cudasim("trunc only supported on NumPy float64") def test_math_trunc_non_float64(self): self.unary_template_float32(math_trunc, np.trunc) self.unary_template_int64(math_trunc, np.trunc) self.unary_template_uint64(math_trunc, np.trunc) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_copysign def test_math_copysign(self): self.binary_template_float32(math_copysign, np.copysign, start=-1) self.binary_template_float64(math_copysign, np.copysign, start=-1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_modf def test_math_modf(self): @@ -696,45 +705,53 @@ def modf_template_compare(A, dtype, arytype): cfunc = cuda.jit((arytype, arytype, arytype))(math_modf) cfunc[1, len(A)](A, B, C) D, E = np.modf(A) - self.assertTrue(np.array_equal(B,D)) - self.assertTrue(np.array_equal(C,E)) + self.assertTrue(np.array_equal(B, D)) + self.assertTrue(np.array_equal(C, E)) nelem = 50 - #32 bit float + # 32 bit float with self.subTest("float32 modf on simple float"): - modf_template_compare(np.linspace(0, 10, nelem), dtype=np.float32, - 
arytype=float32[:]) + modf_template_compare( + np.linspace(0, 10, nelem), dtype=np.float32, arytype=float32[:] + ) with self.subTest("float32 modf on +- infinity"): - modf_template_compare(np.array([np.inf, -np.inf]), dtype=np.float32, - arytype=float32[:]) + modf_template_compare( + np.array([np.inf, -np.inf]), + dtype=np.float32, + arytype=float32[:], + ) with self.subTest("float32 modf on nan"): modf_template_nan(dtype=np.float32, arytype=float32[:]) - #64 bit float + # 64 bit float with self.subTest("float64 modf on simple float"): - modf_template_compare(np.linspace(0, 10, nelem), dtype=np.float64, - arytype=float64[:]) + modf_template_compare( + np.linspace(0, 10, nelem), dtype=np.float64, arytype=float64[:] + ) with self.subTest("float64 modf on +- infinity"): - modf_template_compare(np.array([np.inf, -np.inf]), dtype=np.float64, - arytype=float64[:]) + modf_template_compare( + np.array([np.inf, -np.inf]), + dtype=np.float64, + arytype=float64[:], + ) with self.subTest("float64 modf on nan"): modf_template_nan(dtype=np.float64, arytype=float64[:]) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_fmod def test_math_fmod(self): self.binary_template_float32(math_fmod, np.fmod, start=1) self.binary_template_float64(math_fmod, np.fmod, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_mod_binop def test_math_mod_binop(self): self.binary_template_float32(math_mod_binop, np.fmod, start=1) self.binary_template_float64(math_mod_binop, np.fmod, start=1) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isnan def test_math_isnan(self): @@ -745,7 +762,7 @@ def test_math_isnan(self): 
self.unary_bool_special_values_float32(math_isnan, np.isnan) self.unary_bool_special_values_float64(math_isnan, np.isnan) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isinf def test_math_isinf(self): @@ -756,7 +773,7 @@ def test_math_isinf(self): self.unary_bool_special_values_float32(math_isinf, np.isinf) self.unary_bool_special_values_float64(math_isinf, np.isinf) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_isfinite def test_math_isfinite(self): @@ -767,14 +784,14 @@ def test_math_isfinite(self): self.unary_bool_special_values_float32(math_isfinite, np.isfinite) self.unary_bool_special_values_float64(math_isfinite, np.isfinite) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_degrees def test_math_degrees(self): self.unary_bool_template_float32(math_degrees, np.degrees) self.unary_bool_template_float64(math_degrees, np.degrees) - #--------------------------------------------------------------------------- + # --------------------------------------------------------------------------- # test_math_radians def test_math_radians(self): @@ -782,5 +799,5 @@ def test_math_radians(self): self.unary_bool_template_float64(math_radians, np.radians) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py b/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py index 51f1181a3..0071287c7 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py @@ -15,9 +15,7 @@ class TestCudaMatMul(CUDATestCase): - def test_func(self): - @cuda.jit(void(float32[:, ::1], float32[:, ::1], 
float32[:, ::1])) def cu_square_matrix_mul(A, B, C): sA = cuda.shared.array(shape=SM_SIZE, dtype=float32) @@ -70,5 +68,5 @@ def cu_square_matrix_mul(A, B, C): np.testing.assert_allclose(C, Cans, rtol=1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py index aee97fd63..c44a2b5e6 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py @@ -22,20 +22,21 @@ def builtin_min(A, B, C): C[i] = float64(min(A[i], B[i])) -@skip_on_cudasim('Tests PTX emission') +@skip_on_cudasim("Tests PTX emission") class TestCudaMinMax(CUDATestCase): def _run( - self, - kernel, - numpy_equivalent, - ptx_instruction, - dtype_left, - dtype_right, - n=5): + self, + kernel, + numpy_equivalent, + ptx_instruction, + dtype_left, + dtype_right, + n=5, + ): kernel = cuda.jit(kernel) c = np.zeros(n, dtype=np.float64) - a = np.arange(n, dtype=dtype_left) + .5 + a = np.arange(n, dtype=dtype_left) + 0.5 b = np.full(n, fill_value=2, dtype=dtype_right) kernel[1, c.shape](a, b, c) @@ -45,69 +46,29 @@ def _run( self.assertIn(ptx_instruction, ptx) def test_max_f8f8(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float64, - np.float64) + self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float64) def test_max_f4f8(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float32, - np.float64) + self._run(builtin_max, np.maximum, "max.f64", np.float32, np.float64) def test_max_f8f4(self): - self._run( - builtin_max, - np.maximum, - 'max.f64', - np.float64, - np.float32) + self._run(builtin_max, np.maximum, "max.f64", np.float64, np.float32) def test_max_f4f4(self): - self._run( - builtin_max, - np.maximum, - 'max.f32', - np.float32, - np.float32) + self._run(builtin_max, np.maximum, "max.f32", np.float32, np.float32) def test_min_f8f8(self): - self._run( - builtin_min, - 
np.minimum, - 'min.f64', - np.float64, - np.float64) + self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float64) def test_min_f4f8(self): - self._run( - builtin_min, - np.minimum, - 'min.f64', - np.float32, - np.float64) + self._run(builtin_min, np.minimum, "min.f64", np.float32, np.float64) def test_min_f8f4(self): - self._run( - builtin_min, - np.minimum, - 'min.f64', - np.float64, - np.float32) + self._run(builtin_min, np.minimum, "min.f64", np.float64, np.float32) def test_min_f4f4(self): - self._run( - builtin_min, - np.minimum, - 'min.f32', - np.float32, - np.float32) + self._run(builtin_min, np.minimum, "min.f32", np.float32, np.float32) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py b/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py index 181a80a69..143fa10c6 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py @@ -5,11 +5,11 @@ class TestCudaMonteCarlo(CUDATestCase): def test_montecarlo(self): - """Just make sure we can compile this - """ + """Just make sure we can compile this""" @cuda.jit( - 'void(double[:], double[:], double, double, double, double[:])') + "void(double[:], double[:], double, double, double, double[:])" + ) def step(last, paths, dt, c0, c1, normdist): i = cuda.grid(1) if i >= paths.shape[0]: @@ -18,5 +18,5 @@ def step(last, paths, dt, c0, c1, normdist): paths[i] = last[i] * math.exp(c0 * dt + c1 * noise) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py index 01b8a63ea..700987252 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py @@ -47,7 +47,7 @@ def check(inp, out): copy_plus_1[1, N](A, B) check(A, B) - @skip_on_cudasim('Simulator does not support 
multiple threads') + @skip_on_cudasim("Simulator does not support multiple threads") def test_multithreaded(self): def work(gpu, dA, results, ridx): try: @@ -64,9 +64,12 @@ def work(gpu, dA, results, ridx): nthreads = 10 results = [None] * nthreads - threads = [threading.Thread(target=work, args=(cuda.gpus.current, - dA, results, i)) - for i in range(nthreads)] + threads = [ + threading.Thread( + target=work, args=(cuda.gpus.current, dA, results, i) + ) + for i in range(nthreads) + ] for th in threads: th.start() @@ -81,7 +84,6 @@ def work(gpu, dA, results, ridx): @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus") def test_with_context(self): - @cuda.jit def vector_add_scalar(arr, val): i = cuda.grid(1) @@ -115,7 +117,7 @@ def test_with_context_peer_copy(self): with cuda.gpus[0]: ctx = cuda.current_context() if not ctx.can_access_peer(1): - self.skipTest('Peer access between GPUs disabled') + self.skipTest("Peer access between GPUs disabled") # 1. Create a range in an array hostarr = np.arange(10, dtype=np.float32) @@ -136,5 +138,5 @@ def test_with_context_peer_copy(self): np.testing.assert_equal(arr2.copy_to_host(), hostarr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py b/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py index 04a1234b4..4d3fa07ca 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py @@ -7,12 +7,13 @@ from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -has_mp_get_context = hasattr(mp, 'get_context') -is_unix = os.name == 'posix' +has_mp_get_context = hasattr(mp, "get_context") +is_unix = os.name == "posix" def fork_test(q): from numba.cuda.cudadrv.error import CudaDriverError + try: cuda.to_device(np.arange(1)) except CudaDriverError as e: @@ -21,17 +22,17 @@ def fork_test(q): q.put(None) -@skip_on_cudasim('disabled for 
cudasim') +@skip_on_cudasim("disabled for cudasim") class TestMultiprocessing(CUDATestCase): - @unittest.skipUnless(has_mp_get_context, 'requires mp.get_context') - @unittest.skipUnless(is_unix, 'requires Unix') + @unittest.skipUnless(has_mp_get_context, "requires mp.get_context") + @unittest.skipUnless(is_unix, "requires Unix") def test_fork(self): """ Test fork detection. """ cuda.current_context() # force cuda initialize # fork in process that also uses CUDA - ctx = mp.get_context('fork') + ctx = mp.get_context("fork") q = ctx.Queue() proc = ctx.Process(target=fork_test, args=[q]) proc.start() @@ -39,8 +40,8 @@ def test_fork(self): proc.join() # there should be an exception raised in the child process self.assertIsNotNone(exc) - self.assertIn('CUDA initialized before forking', str(exc)) + self.assertIn("CUDA initialized before forking", str(exc)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py index 30afd3eb0..7ca6ff8dd 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py @@ -3,8 +3,11 @@ import multiprocessing import numpy as np from numba import cuda -from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_under_cuda_memcheck, + CUDATestCase, +) import unittest try: @@ -15,7 +18,7 @@ has_concurrent_futures = True -has_mp_get_context = hasattr(multiprocessing, 'get_context') +has_mp_get_context = hasattr(multiprocessing, "get_context") def check_concurrent_compiling(): @@ -41,15 +44,14 @@ def spawn_process_entry(q): # Catch anything that goes wrong in the threads except: # noqa: E722 msg = traceback.format_exc() - q.put('\n'.join(['', '=' * 80, msg])) + q.put("\n".join(["", "=" * 80, msg])) else: q.put(None) -@skip_under_cuda_memcheck('Hangs 
cuda-memcheck') -@skip_on_cudasim('disabled for cudasim') +@skip_under_cuda_memcheck("Hangs cuda-memcheck") +@skip_on_cudasim("disabled for cudasim") class TestMultiThreadCompiling(CUDATestCase): - @unittest.skipIf(not has_concurrent_futures, "no concurrent.futures") def test_concurrent_compiling(self): check_concurrent_compiling() @@ -59,7 +61,7 @@ def test_spawn_concurrent_compilation(self): # force CUDA context init cuda.get_current_device() # use "spawn" to avoid inheriting the CUDA context - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") q = ctx.Queue() p = ctx.Process(target=spawn_process_entry, args=(q,)) @@ -70,7 +72,7 @@ def test_spawn_concurrent_compilation(self): p.join() if err is not None: raise AssertionError(err) - self.assertEqual(p.exitcode, 0, 'test failed in child process') + self.assertEqual(p.exitcode, 0, "test failed in child process") def test_invalid_context_error_with_d2h(self): def d2h(arr, out): @@ -97,5 +99,5 @@ def d2d(dst, src): np.testing.assert_equal(darr.copy_to_host(), arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py b/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py index eaf141052..af57a47ed 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py @@ -45,5 +45,5 @@ def diagproduct(c, a, b): np.testing.assert_array_almost_equal(dF.copy_to_host(), E) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_operator.py b/numba_cuda/numba/cuda/tests/cudapy/test_operator.py index 0547d55fe..5df98b1e2 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_operator.py @@ -1,6 +1,10 @@ import numpy as np -from numba.cuda.testing import (unittest, CUDATestCase, skip_unless_cc_53, - skip_on_cudasim) +from numba.cuda.testing 
import ( + unittest, + CUDATestCase, + skip_unless_cc_53, + skip_on_cudasim, +) from numba import cuda from numba.core.types import f2, b1 from numba.cuda import compile_ptx @@ -73,12 +77,12 @@ def simple_fp16_ne(ary, a, b): ary[0] = a != b -@cuda.jit('b1(f2, f2)', device=True) +@cuda.jit("b1(f2, f2)", device=True) def hlt_func_1(x, y): return x < y -@cuda.jit('b1(f2, f2)', device=True) +@cuda.jit("b1(f2, f2)", device=True) def hlt_func_2(x, y): return x < y @@ -116,6 +120,7 @@ def setUp(self): """ Test if operator module is supported by the CUDA target. """ + def operator_template(self, op): @cuda.jit def foo(a, b): @@ -146,8 +151,12 @@ def test_floordiv(self): @skip_unless_cc_53 def test_fp16_binary(self): - functions = (simple_fp16add, simple_fp16sub, simple_fp16mul, - simple_fp16_div_scalar) + functions = ( + simple_fp16add, + simple_fp16sub, + simple_fp16mul, + simple_fp16_div_scalar, + ) ops = (operator.add, operator.sub, operator.mul, operator.truediv) for fn, op in zip(functions, ops): @@ -162,10 +171,10 @@ def test_fp16_binary(self): expected = op(arg1, arg2) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_binary_ptx(self): functions = (simple_fp16add, simple_fp16sub, simple_fp16mul) - instrs = ('add.f16', 'sub.f16', 'mul.f16') + instrs = ("add.f16", "sub.f16", "mul.f16") args = (f2[:], f2, f2) for fn, instr in zip(functions, instrs): with self.subTest(instr=instr): @@ -174,11 +183,14 @@ def test_fp16_binary_ptx(self): @skip_unless_cc_53 def test_mixed_fp16_binary_arithmetic(self): - functions = (simple_fp16add, simple_fp16sub, simple_fp16mul, - simple_fp16_div_scalar) + functions = ( + simple_fp16add, + simple_fp16sub, + simple_fp16mul, + simple_fp16_div_scalar, + ) ops = (operator.add, operator.sub, operator.mul, operator.truediv) - types = (np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64) + types = 
(np.int8, np.int16, np.int32, np.int64, np.float32, np.float64) for (fn, op), ty in itertools.product(zip(functions, ops), types): with self.subTest(op=op, ty=ty): kernel = cuda.jit(fn) @@ -192,10 +204,10 @@ def test_mixed_fp16_binary_arithmetic(self): expected = op(arg1, arg2) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_inplace_binary_ptx(self): functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul) - instrs = ('add.f16', 'sub.f16', 'mul.f16') + instrs = ("add.f16", "sub.f16", "mul.f16") args = (f2[:], f2) for fn, instr in zip(functions, instrs): @@ -205,8 +217,12 @@ def test_fp16_inplace_binary_ptx(self): @skip_unless_cc_53 def test_fp16_inplace_binary(self): - functions = (simple_fp16_iadd, simple_fp16_isub, simple_fp16_imul, - simple_fp16_idiv) + functions = ( + simple_fp16_iadd, + simple_fp16_isub, + simple_fp16_imul, + simple_fp16_idiv, + ) ops = (operator.iadd, operator.isub, operator.imul, operator.itruediv) for fn, op in zip(functions, ops): @@ -236,26 +252,37 @@ def test_fp16_unary(self): expected = op(arg1) np.testing.assert_allclose(got, expected) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_neg_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3)) - self.assertIn('neg.f16', ptx) + self.assertIn("neg.f16", ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_abs_ptx(self): args = (f2[:], f2) ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3)) - self.assertIn('abs.f16', ptx) + self.assertIn("abs.f16", ptx) @skip_unless_cc_53 def test_fp16_comparison(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, 
operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) for fn, op in zip(functions, ops): with self.subTest(op=op): @@ -271,16 +298,25 @@ def test_fp16_comparison(self): @skip_unless_cc_53 def test_mixed_fp16_comparison(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - types = (np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64) - - for (fn, op), ty in itertools.product(zip(functions, ops), - types): + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + types = (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64) + + for (fn, op), ty in itertools.product(zip(functions, ops), types): with self.subTest(op=op, ty=ty): kernel = cuda.jit(fn) @@ -294,48 +330,68 @@ def test_mixed_fp16_comparison(self): @skip_unless_cc_53 def test_multiple_float16_comparisons(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(4.) 
+ arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(4.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertTrue(ary[0]) @skip_unless_cc_53 def test_multiple_float16_comparisons_false(self): - functions = (test_multiple_hcmp_1, - test_multiple_hcmp_2, - test_multiple_hcmp_3, - test_multiple_hcmp_4, - test_multiple_hcmp_5) + functions = ( + test_multiple_hcmp_1, + test_multiple_hcmp_2, + test_multiple_hcmp_3, + test_multiple_hcmp_4, + test_multiple_hcmp_5, + ) for fn in functions: with self.subTest(fn=fn): compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn) ary = np.zeros(1, dtype=np.bool_) - arg1 = np.float16(2.) - arg2 = np.float16(3.) - arg3 = np.float16(1.) + arg1 = np.float16(2.0) + arg2 = np.float16(3.0) + arg3 = np.float16(1.0) compiled[1, 1](ary, arg1, arg2, arg3) self.assertFalse(ary[0]) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_fp16_comparison_ptx(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - opstring = ('setp.gt.f16', 'setp.ge.f16', - 'setp.lt.f16', 'setp.le.f16', - 'setp.eq.f16', 'setp.ne.f16') + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + opstring = ( + "setp.gt.f16", + "setp.ge.f16", + "setp.lt.f16", + "setp.le.f16", + "setp.eq.f16", + "setp.ne.f16", + ) args = (b1[:], f2, f2) for fn, op, s in zip(functions, ops, opstring): @@ -343,51 +399,79 @@ def test_fp16_comparison_ptx(self): ptx, _ = compile_ptx(fn, args, cc=(5, 3)) self.assertIn(s, ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def 
test_fp16_int8_comparison_ptx(self): # Test that int8 can be safely converted to fp16 # in a comparison - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - - opstring = {operator.gt:'setp.gt.f16', - operator.ge:'setp.ge.f16', - operator.lt:'setp.lt.f16', - operator.le:'setp.le.f16', - operator.eq:'setp.eq.f16', - operator.ne:'setp.ne.f16'} + functions = ( + simple_fp16_gt, + simple_fp16_ge, + simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + + opstring = { + operator.gt: "setp.gt.f16", + operator.ge: "setp.ge.f16", + operator.lt: "setp.lt.f16", + operator.le: "setp.le.f16", + operator.eq: "setp.eq.f16", + operator.ne: "setp.ne.f16", + } for fn, op in zip(functions, ops): with self.subTest(op=op): args = (b1[:], f2, from_dtype(np.int8)) ptx, _ = compile_ptx(fn, args, cc=(5, 3)) self.assertIn(opstring[op], ptx) - @skip_on_cudasim('Compilation unsupported in the simulator') + @skip_on_cudasim("Compilation unsupported in the simulator") def test_mixed_fp16_comparison_promotion_ptx(self): - functions = (simple_fp16_gt, simple_fp16_ge, - simple_fp16_lt, simple_fp16_le, - simple_fp16_eq, simple_fp16_ne) - ops = (operator.gt, operator.ge, operator.lt, operator.le, - operator.eq, operator.ne) - - types_promote = (np.int16, np.int32, np.int64, - np.float32, np.float64) - opstring = {operator.gt:'setp.gt.', - operator.ge:'setp.ge.', - operator.lt:'setp.lt.', - operator.le:'setp.le.', - operator.eq:'setp.eq.', - operator.ne:'setp.neu.'} - opsuffix = {np.dtype('int32'): 'f64', - np.dtype('int64'): 'f64', - np.dtype('float32'): 'f32', - np.dtype('float64'): 'f64'} - - for (fn, op), ty in itertools.product(zip(functions, ops), - types_promote): + functions = ( + simple_fp16_gt, + simple_fp16_ge, + 
simple_fp16_lt, + simple_fp16_le, + simple_fp16_eq, + simple_fp16_ne, + ) + ops = ( + operator.gt, + operator.ge, + operator.lt, + operator.le, + operator.eq, + operator.ne, + ) + + types_promote = (np.int16, np.int32, np.int64, np.float32, np.float64) + opstring = { + operator.gt: "setp.gt.", + operator.ge: "setp.ge.", + operator.lt: "setp.lt.", + operator.le: "setp.le.", + operator.eq: "setp.eq.", + operator.ne: "setp.neu.", + } + opsuffix = { + np.dtype("int32"): "f64", + np.dtype("int64"): "f64", + np.dtype("float32"): "f32", + np.dtype("float64"): "f64", + } + + for (fn, op), ty in itertools.product( + zip(functions, ops), types_promote + ): with self.subTest(op=op, ty=ty): arg2_ty = np.result_type(np.float16, ty) args = (b1[:], f2, from_dtype(arg2_ty)) @@ -397,5 +481,5 @@ def test_mixed_fp16_comparison_promotion_ptx(self): self.assertIn(ops, ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py b/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py index 27399727b..200ec5264 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py @@ -18,10 +18,10 @@ def device_func(x, y, z): # the test function were more complex it may be possible to isolate additional # fragments of PTX we could check for the absence / presence of, but removal of # the use of local memory is a good indicator that optimization was applied. 
-removed_by_opt = ( '__local_depot0',) +removed_by_opt = ("__local_depot0",) -@skip_on_cudasim('Simulator does not optimize code') +@skip_on_cudasim("Simulator does not optimize code") class TestOptimization(CUDATestCase): def test_eager_opt(self): # Optimization should occur by default @@ -74,7 +74,7 @@ def test_device_opt(self): sig = (float64, float64, float64) device = cuda.jit(sig, device=True)(device_func) ptx = device.inspect_asm(sig) - self.assertIn('fma.rn.f64', ptx) + self.assertIn("fma.rn.f64", ptx) def test_device_noopt(self): # Optimization disabled @@ -82,8 +82,8 @@ def test_device_noopt(self): device = cuda.jit(sig, device=True, opt=False)(device_func) ptx = device.inspect_asm(sig) # Fused-multiply adds should be disabled when not optimizing - self.assertNotIn('fma.rn.f64', ptx) + self.assertNotIn("fma.rn.f64", ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_overload.py b/numba_cuda/numba/cuda/tests/cudapy/test_overload.py index 746ea3f4a..51752f732 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_overload.py @@ -8,6 +8,7 @@ # Dummy function definitions to overload + def generic_func_1(): pass @@ -83,109 +84,124 @@ def default_values_and_kwargs(): # Overload implementations -@overload(generic_func_1, target='generic') + +@overload(generic_func_1, target="generic") def ol_generic_func_1(x): def impl(x): x[0] *= GENERIC_FUNCTION_1 + return impl -@overload(cuda_func_1, target='cuda') +@overload(cuda_func_1, target="cuda") def ol_cuda_func_1(x): def impl(x): x[0] *= CUDA_FUNCTION_1 + return impl -@overload(generic_func_2, target='generic') +@overload(generic_func_2, target="generic") def ol_generic_func_2(x): def impl(x): x[0] *= GENERIC_FUNCTION_2 + return impl -@overload(cuda_func_2, target='cuda') +@overload(cuda_func_2, target="cuda") def ol_cuda_func(x): def impl(x): x[0] *= CUDA_FUNCTION_2 + return impl 
-@overload(generic_calls_generic, target='generic') +@overload(generic_calls_generic, target="generic") def ol_generic_calls_generic(x): def impl(x): x[0] *= GENERIC_CALLS_GENERIC generic_func_1(x) + return impl -@overload(generic_calls_cuda, target='generic') +@overload(generic_calls_cuda, target="generic") def ol_generic_calls_cuda(x): def impl(x): x[0] *= GENERIC_CALLS_CUDA cuda_func_1(x) + return impl -@overload(cuda_calls_generic, target='cuda') +@overload(cuda_calls_generic, target="cuda") def ol_cuda_calls_generic(x): def impl(x): x[0] *= CUDA_CALLS_GENERIC generic_func_1(x) + return impl -@overload(cuda_calls_cuda, target='cuda') +@overload(cuda_calls_cuda, target="cuda") def ol_cuda_calls_cuda(x): def impl(x): x[0] *= CUDA_CALLS_CUDA cuda_func_1(x) + return impl -@overload(target_overloaded, target='generic') +@overload(target_overloaded, target="generic") def ol_target_overloaded_generic(x): def impl(x): x[0] *= GENERIC_TARGET_OL + return impl -@overload(target_overloaded, target='cuda') +@overload(target_overloaded, target="cuda") def ol_target_overloaded_cuda(x): def impl(x): x[0] *= CUDA_TARGET_OL + return impl -@overload(generic_calls_target_overloaded, target='generic') +@overload(generic_calls_target_overloaded, target="generic") def ol_generic_calls_target_overloaded(x): def impl(x): x[0] *= GENERIC_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(cuda_calls_target_overloaded, target='cuda') +@overload(cuda_calls_target_overloaded, target="cuda") def ol_cuda_calls_target_overloaded(x): def impl(x): x[0] *= CUDA_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(target_overloaded_calls_target_overloaded, target='generic') +@overload(target_overloaded_calls_target_overloaded, target="generic") def ol_generic_calls_target_overloaded_generic(x): def impl(x): x[0] *= GENERIC_TARGET_OL_CALLS_TARGET_OL target_overloaded(x) + return impl -@overload(target_overloaded_calls_target_overloaded, target='cuda') 
+@overload(target_overloaded_calls_target_overloaded, target="cuda") def ol_generic_calls_target_overloaded_cuda(x): def impl(x): x[0] *= CUDA_TARGET_OL_CALLS_TARGET_OL target_overloaded(x) + return impl @@ -193,10 +209,11 @@ def impl(x): def ol_default_values_and_kwargs(out, x, y=5, z=6): def impl(out, x, y=5, z=6): out[0], out[1] = x + y, z + return impl -@skip_on_cudasim('Overloading not supported in cudasim') +@skip_on_cudasim("Overloading not supported in cudasim") class TestOverload(CUDATestCase): def check_overload(self, kernel, expected): x = np.ones(1, dtype=np.int32) @@ -311,7 +328,7 @@ def test_overload_attribute_target(self): MyDummy, MyDummyType = self.make_dummy_type() mydummy_type = typeof(MyDummy()) - @overload_attribute(MyDummyType, 'cuda_only', target='cuda') + @overload_attribute(MyDummyType, "cuda_only", target="cuda") def ov_dummy_cuda_attr(obj): def imp(obj): return 42 @@ -330,6 +347,7 @@ def imp(obj): msg = "Unknown attribute 'cuda_only'" with self.assertRaisesRegex(TypingError, msg): + @njit(types.int64(mydummy_type)) def illegal_target_attr_use(x): return x.cuda_only @@ -345,14 +363,15 @@ def test_default_values_and_kwargs(self): """ Test default values and kwargs. 
""" + @cuda.jit() def kernel(a, b, out): default_values_and_kwargs(out, a, z=b) out = np.empty(2, dtype=np.int64) - kernel[1,1](1, 2, out) + kernel[1, 1](1, 2, out) self.assertEqual(tuple(out), (6, 2)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py index 1932b3165..331a4b25c 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_powi.py @@ -47,7 +47,7 @@ def vec_pow_inplace_binop(r, x): def random_complex(N): np.random.seed(123) - return (np.random.random(1) + np.random.random(1) * 1j) + return np.random.random(1) + np.random.random(1) * 1j class TestCudaPowi(CUDATestCase): @@ -59,7 +59,7 @@ def test_powi(self): A = np.arange(10, dtype=np.float64).reshape(2, 5) Aout = np.empty_like(A) kernel[1, A.shape](A, power, Aout) - self.assertTrue(np.allclose(Aout, A ** power)) + self.assertTrue(np.allclose(Aout, A**power)) def test_powi_binop(self): dec = cuda.jit(void(float64[:, :], int8, float64[:, :])) @@ -69,7 +69,7 @@ def test_powi_binop(self): A = np.arange(10, dtype=np.float64).reshape(2, 5) Aout = np.empty_like(A) kernel[1, A.shape](A, power, Aout) - self.assertTrue(np.allclose(Aout, A ** power)) + self.assertTrue(np.allclose(Aout, A**power)) # Relative tolerance kwarg is provided because 1.0e-7 (the default for # assert_allclose) is a bit tight for single precision. 
@@ -81,7 +81,7 @@ def _test_cpow(self, dtype, func, rtol=1.0e-7): cfunc = cuda.jit(func) cfunc[1, N](r, x, y) - np.testing.assert_allclose(r, x ** y, rtol=rtol) + np.testing.assert_allclose(r, x**y, rtol=rtol) # Checks special cases x = np.asarray([0.0j, 1.0j], dtype=dtype) @@ -89,7 +89,7 @@ def _test_cpow(self, dtype, func, rtol=1.0e-7): r = np.zeros_like(x) cfunc[1, 2](r, x, y) - np.testing.assert_allclose(r, x ** y, rtol=rtol) + np.testing.assert_allclose(r, x**y, rtol=rtol) def test_cpow_complex64_pow(self): self._test_cpow(np.complex64, vec_pow, rtol=3.0e-7) @@ -107,7 +107,7 @@ def _test_cpow_inplace_binop(self, dtype, rtol=1.0e-7): N = 32 x = random_complex(N).astype(dtype) y = random_complex(N).astype(dtype) - r = x ** y + r = x**y cfunc = cuda.jit(vec_pow_inplace_binop) cfunc[1, N](x, y) @@ -120,5 +120,5 @@ def test_cpow_complex128_inplace_binop(self): self._test_cpow_inplace_binop(np.complex128, rtol=3.0e-7) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_print.py b/numba_cuda/numba/cuda/tests/cudapy/test_print.py index 30328ead4..0dbb3139b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_print.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_print.py @@ -113,7 +113,7 @@ def run_code(self, code): def test_cuhello(self): output, _ = self.run_code(cuhello_usecase) actual = [line.strip() for line in output.splitlines()] - expected = ['-42'] * 6 + ['%d 999' % i for i in range(6)] + expected = ["-42"] * 6 + ["%d 999" % i for i in range(6)] # The output of GPU threads is intermingled, but each print() # call is still atomic self.assertEqual(sorted(actual), expected) @@ -136,7 +136,7 @@ def test_printempty(self): def test_string(self): output, _ = self.run_code(printstring_usecase) lines = [line.strip() for line in output.splitlines(True)] - expected = ['%d hop! 999' % i for i in range(3)] + expected = ["%d hop! 
999" % i for i in range(3)] self.assertEqual(sorted(lines), expected) def test_dim3(self): @@ -145,7 +145,7 @@ def test_dim3(self): expected = [str(i) for i in np.ndindex(2, 2, 2)] self.assertEqual(sorted(lines), expected) - @skip_on_cudasim('cudasim can print unlimited output') + @skip_on_cudasim("cudasim can print unlimited output") def test_too_many_args(self): # Tests that we emit the format string and warn when there are more # than 32 arguments, in common with CUDA C/C++ printf - this is due to @@ -155,14 +155,16 @@ def test_too_many_args(self): output, errors = self.run_code(print_too_many_usecase) # Check that the format string was printed instead of formatted garbage - expected_fmt_string = ' '.join(['%lld' for _ in range(33)]) + expected_fmt_string = " ".join(["%lld" for _ in range(33)]) self.assertIn(expected_fmt_string, output) # Check for the expected warning about formatting more than 32 items - warn_msg = ('CUDA print() cannot print more than 32 items. The raw ' - 'format string will be emitted by the kernel instead.') + warn_msg = ( + "CUDA print() cannot print more than 32 items. The raw " + "format string will be emitted by the kernel instead." 
+ ) self.assertIn(warn_msg, errors) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py b/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py index 298a5b747..8ee4b786d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py @@ -29,5 +29,5 @@ def preCalc(y, yA, yB, numDataPoints): self.assertTrue(np.all(y == yB)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_random.py b/numba_cuda/numba/cuda/tests/cudapy/test_random.py index 11bbf95aa..feffb840e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_random.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_random.py @@ -6,9 +6,12 @@ from numba.cuda.testing import unittest from numba.cuda.testing import skip_on_cudasim, CUDATestCase -from numba.cuda.random import \ - xoroshiro128p_uniform_float32, xoroshiro128p_normal_float32, \ - xoroshiro128p_uniform_float64, xoroshiro128p_normal_float64 +from numba.cuda.random import ( + xoroshiro128p_uniform_float32, + xoroshiro128p_normal_float32, + xoroshiro128p_uniform_float64, + xoroshiro128p_normal_float64, +) # Distributions @@ -52,8 +55,9 @@ def test_create_subsequence_start(self): states = cuda.random.create_xoroshiro128p_states(10, seed=1) s1 = states.copy_to_host() - states = cuda.random.create_xoroshiro128p_states(10, seed=1, - subsequence_start=3) + states = cuda.random.create_xoroshiro128p_states( + 10, seed=1, subsequence_start=3 + ) s2 = states.copy_to_host() # Starting seeds should match up with offset of 3 @@ -61,8 +65,9 @@ def test_create_subsequence_start(self): def test_create_stream(self): stream = cuda.stream() - states = cuda.random.create_xoroshiro128p_states(10, seed=1, - stream=stream) + states = cuda.random.create_xoroshiro128p_states( + 10, seed=1, stream=stream + ) s = states.copy_to_host() 
self.assertEqual(len(np.unique(s)), 10) @@ -79,7 +84,7 @@ def check_uniform(self, kernel_func, dtype): def test_uniform_float32(self): self.check_uniform(rng_kernel_float32, np.float32) - @skip_on_cudasim('skip test for speed under cudasim') + @skip_on_cudasim("skip test for speed under cudasim") def test_uniform_float64(self): self.check_uniform(rng_kernel_float64, np.float64) @@ -95,10 +100,10 @@ def check_normal(self, kernel_func, dtype): def test_normal_float32(self): self.check_normal(rng_kernel_float32, np.float32) - @skip_on_cudasim('skip test for speed under cudasim') + @skip_on_cudasim("skip test for speed under cudasim") def test_normal_float64(self): self.check_normal(rng_kernel_float64, np.float64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py b/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py index 75651488e..85ddf1d74 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py @@ -78,24 +78,17 @@ def record_read_2d_array(r, a): recordtype = np.dtype( [ - ('a', np.float64), - ('b', np.int32), - ('c', np.complex64), - ('d', (np.uint8, 5)) + ("a", np.float64), + ("b", np.int32), + ("c", np.complex64), + ("d", (np.uint8, 5)), ], - align=True + align=True, ) -recordwitharray = np.dtype( - [ - ('g', np.int32), - ('h', np.float32, 2) - ], - align=True -) +recordwitharray = np.dtype([("g", np.int32), ("h", np.float32, 2)], align=True) -recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) nested_array1_dtype = np.dtype([("array1", np.int16, (3,))], align=True) @@ -104,12 +97,13 @@ def record_read_2d_array(r, a): # Functions used for "full array" tests + def record_write_full_array(rec): rec.j[:, :] = np.ones((3, 2)) def record_write_full_array_alt(rec): - rec['j'][:, :] = np.ones((3, 2)) + 
rec["j"][:, :] = np.ones((3, 2)) def recarray_set_record(ary, rec): @@ -122,7 +116,7 @@ def recarray_write_array_of_nestedarray_broadcast(ary): def record_setitem_array(rec_source, rec_dest): - rec_dest['j'] = rec_source['j'] + rec_dest["j"] = rec_source["j"] def recarray_write_array_of_nestedarray(ary): @@ -135,7 +129,7 @@ def recarray_getitem_return(ary): def recarray_getitem_field_return(ary): - return ary['h'] + return ary["h"] def recarray_getitem_field_return2(ary): @@ -171,15 +165,14 @@ def record_read_2d_array01(ary): def assign_array_to_nested(dest, src): - dest['array1'] = src + dest["array1"] = src def assign_array_to_nested_2d(dest, src): - dest['array2'] = src + dest["array2"] = src class TestRecordDtype(CUDATestCase): - def _createSampleArrays(self): self.sample1d = np.recarray(3, dtype=recordtype) self.samplerec1darr = np.recarray(1, dtype=recordwitharray)[0] @@ -192,10 +185,10 @@ def setUp(self): ary = self.sample1d for i in range(ary.size): x = i + 1 - ary[i]['a'] = x / 2 - ary[i]['b'] = x - ary[i]['c'] = x * 1j - ary[i]['d'] = "%d" % x + ary[i]["a"] = x / 2 + ary[i]["b"] = x + ary[i]["c"] = x * 1j + ary[i]["d"] = "%d" % x def get_cfunc(self, pyfunc, argspec): return cuda.jit()(pyfunc) @@ -221,7 +214,7 @@ def _test_set_equal(self, pyfunc, value, valuetype): def test_set_a(self): self._test_set_equal(set_a, 3.1415, types.float64) # Test again to check if coercion works - self._test_set_equal(set_a, 3., types.float32) + self._test_set_equal(set_a, 3.0, types.float32) def test_set_b(self): self._test_set_equal(set_b, 123, types.int32) @@ -259,13 +252,13 @@ def _test_rec_set(self, v, pyfunc, f): np.testing.assert_equal(rec[f], v) def test_rec_set_a(self): - self._test_rec_set(np.float64(1.5), record_set_a, 'a') + self._test_rec_set(np.float64(1.5), record_set_a, "a") def test_rec_set_b(self): - self._test_rec_set(np.int32(2), record_set_b, 'b') + self._test_rec_set(np.int32(2), record_set_b, "b") def test_rec_set_c(self): - 
self._test_rec_set(np.complex64(4.0 + 5.0j), record_set_c, 'c') + self._test_rec_set(np.complex64(4.0 + 5.0j), record_set_c, "c") def _test_rec_read(self, v, pyfunc, f): rec = self.sample1d.copy()[0] @@ -277,81 +270,83 @@ def _test_rec_read(self, v, pyfunc, f): np.testing.assert_equal(arr[0], v) def test_rec_read_a(self): - self._test_rec_read(np.float64(1.5), record_read_a, 'a') + self._test_rec_read(np.float64(1.5), record_read_a, "a") def test_rec_read_b(self): - self._test_rec_read(np.int32(2), record_read_b, 'b') + self._test_rec_read(np.int32(2), record_read_b, "b") def test_rec_read_c(self): - self._test_rec_read(np.complex64(4.0 + 5.0j), record_read_c, 'c') + self._test_rec_read(np.complex64(4.0 + 5.0j), record_read_c, "c") def test_record_write_1d_array(self): - ''' + """ Test writing to a 1D array within a structured type - ''' + """ rec = self.samplerec1darr.copy() nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_write_array, (nbrecord,)) cfunc[1, 1](rec) expected = self.samplerec1darr.copy() - expected['g'] = 2 - expected['h'][0] = 3.0 - expected['h'][1] = 4.0 + expected["g"] = 2 + expected["h"][0] = 3.0 + expected["h"][1] = 4.0 np.testing.assert_equal(expected, rec) def test_record_write_2d_array(self): - ''' + """ Test writing to a 2D array within a structured type - ''' + """ rec = self.samplerec2darr.copy() nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_write_2d_array, (nbrecord,)) cfunc[1, 1](rec) expected = self.samplerec2darr.copy() - expected['i'] = 3 - expected['j'][:] = np.asarray([5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - np.float32).reshape(3, 2) + expected["i"] = 3 + expected["j"][:] = np.asarray( + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], np.float32 + ).reshape(3, 2) np.testing.assert_equal(expected, rec) def test_record_read_1d_array(self): - ''' + """ Test reading from a 1D array within a structured type - ''' + """ rec = self.samplerec1darr.copy() - rec['h'][0] = 4.0 - rec['h'][1] 
= 5.0 + rec["h"][0] = 4.0 + rec["h"][1] = 5.0 nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_read_array, (nbrecord,)) - arr = np.zeros(2, dtype=rec['h'].dtype) + arr = np.zeros(2, dtype=rec["h"].dtype) cfunc[1, 1](rec, arr) - np.testing.assert_equal(rec['h'], arr) + np.testing.assert_equal(rec["h"], arr) def test_record_read_2d_array(self): - ''' + """ Test reading from a 2D array within a structured type - ''' + """ rec = self.samplerec2darr.copy() - rec['j'][:] = np.asarray([5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - np.float32).reshape(3, 2) + rec["j"][:] = np.asarray( + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0], np.float32 + ).reshape(3, 2) nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_read_2d_array, (nbrecord,)) - arr = np.zeros((3,2), dtype=rec['j'].dtype) + arr = np.zeros((3, 2), dtype=rec["j"].dtype) cfunc[1, 1](rec, arr) - np.testing.assert_equal(rec['j'], arr) + np.testing.assert_equal(rec["j"], arr) -@skip_on_cudasim('Structured array attr access not supported in simulator') +@skip_on_cudasim("Structured array attr access not supported in simulator") class TestRecordDtypeWithStructArrays(TestRecordDtype): - ''' + """ Same as TestRecordDtype, but using structured arrays instead of recarrays. 
- ''' + """ def _createSampleArrays(self): self.sample1d = np.zeros(3, dtype=recordtype) @@ -360,7 +355,6 @@ def _createSampleArrays(self): class TestNestedArrays(CUDATestCase): - # These tests mirror those from # numba.tests.test_record_dtype.TestNestedArrays added in PR # #7359: https://github.com/numba/numba/pull/7359 @@ -405,8 +399,9 @@ def test_record_read_array(self): def test_record_read_2d_array(self): # Test reading from a 2D array within a structured type nbval = np.recarray(1, dtype=recordwith2darray) - nbval[0].j = np.asarray([1.5, 2.5, 3.5, 4.5, 5.5, 6.5], - np.float32).reshape(3, 2) + nbval[0].j = np.asarray( + [1.5, 2.5, 3.5, 4.5, 5.5, 6.5], np.float32 + ).reshape(3, 2) cfunc = self.get_cfunc(record_read_2d_array00, np.float32) res = cfunc(nbval[0]) np.testing.assert_equal(res, nbval[0].j[0, 0]) @@ -422,12 +417,15 @@ def test_record_read_2d_array(self): def test_setitem(self): def gen(): nbarr1 = np.recarray(1, dtype=recordwith2darray) - nbarr1[0] = np.array([(1, ((1, 2), (4, 5), (2, 3)))], - dtype=recordwith2darray)[0] + nbarr1[0] = np.array( + [(1, ((1, 2), (4, 5), (2, 3)))], dtype=recordwith2darray + )[0] nbarr2 = np.recarray(1, dtype=recordwith2darray) - nbarr2[0] = np.array([(10, ((10, 20), (40, 50), (20, 30)))], - dtype=recordwith2darray)[0] + nbarr2[0] = np.array( + [(10, ((10, 20), (40, 50), (20, 30)))], dtype=recordwith2darray + )[0] return nbarr1[0], nbarr2[0] + pyfunc = record_setitem_array pyargs = gen() pyfunc(*pyargs) @@ -453,7 +451,7 @@ def test_getitem_idx(self): # Writing to records / recarrays - @skip_on_cudasim('Structured array attr access not supported in simulator') + @skip_on_cudasim("Structured array attr access not supported in simulator") def test_set_record(self): # Test setting an entire record rec = np.ones(2, dtype=recordwith2darray).view(np.recarray)[0] @@ -492,20 +490,18 @@ def test_assign_array_to_nested_2d(self): np.testing.assert_array_equal(expected, got) def test_issue_7693(self): - src_dtype = np.dtype([ - 
("user", np.float64), - ("array", np.int16, (3,))], - align=True) + src_dtype = np.dtype( + [("user", np.float64), ("array", np.int16, (3,))], align=True + ) - dest_dtype = np.dtype([ - ("user1", np.float64), - ("array1", np.int16, (3,))], - align=True) + dest_dtype = np.dtype( + [("user1", np.float64), ("array1", np.int16, (3,))], align=True + ) @cuda.jit def copy(index, src, dest): - dest['user1'] = src[index]['user'] - dest['array1'] = src[index]['array'] + dest["user1"] = src[index]["user"] + dest["array1"] = src[index]["array"] source = np.zeros(2, dtype=src_dtype) got = np.zeros(2, dtype=dest_dtype) @@ -528,10 +524,13 @@ def test_getitem_idx_2darray(self): # This test returning a record when passing an array and # return the first item when passing a record nbarr = np.recarray(2, dtype=recordwith2darray) - nbarr[0] = np.array([(1, ((1,2),(4,5),(2,3)))], - dtype=recordwith2darray)[0] - for arg, retty in [(nbarr, recordwith2darray), - (nbarr[0], (np.float32, (3, 2)))]: + nbarr[0] = np.array( + [(1, ((1, 2), (4, 5), (2, 3)))], dtype=recordwith2darray + )[0] + for arg, retty in [ + (nbarr, recordwith2darray), + (nbarr[0], (np.float32, (3, 2))), + ]: pyfunc = recarray_getitem_field_return2_2d arr_expected = pyfunc(arg) cfunc = self.get_cfunc(pyfunc, retty) @@ -545,10 +544,12 @@ def test_return_getattr_getitem_fieldname(self): # This tests returning a array of nestedarrays when passing an array and # returning a nestedarray when passing a record nbarr = np.recarray(2, dtype=recordwitharray) - nbarr[0] = np.array([(1, (2,3))], dtype=recordwitharray)[0] + nbarr[0] = np.array([(1, (2, 3))], dtype=recordwitharray)[0] for arg, retty in [(nbarr, recordwitharray), (nbarr[0], np.float32)]: - for pyfunc in [recarray_getitem_field_return, - recarray_getitem_field_return2]: + for pyfunc in [ + recarray_getitem_field_return, + recarray_getitem_field_return2, + ]: arr_expected = pyfunc(arg) cfunc = self.get_cfunc(pyfunc, retty) arr_res = cfunc(arg) @@ -570,17 +571,17 @@ def 
test_record_read_arrays(self): def test_return_array(self): # Test getitem record AND array within record and returning it nbval = np.recarray(2, dtype=recordwitharray) - nbval[0] = np.array([(1, (2,3))], dtype=recordwitharray)[0] + nbval[0] = np.array([(1, (2, 3))], dtype=recordwitharray)[0] pyfunc = record_read_array0 arr_expected = pyfunc(nbval) cfunc = self.get_cfunc(pyfunc, np.float32) arr_res = cfunc(nbval) np.testing.assert_equal(arr_expected, arr_res) - @skip_on_cudasim('Will unexpectedly pass on cudasim') + @skip_on_cudasim("Will unexpectedly pass on cudasim") @unittest.expectedFailure def test_set_array(self): - #Test setting an entire array within one record + # Test setting an entire array within one record arr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) rec = arr[0] nbarr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) @@ -597,8 +598,8 @@ def test_set_arrays(self): arr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) nbarr = np.zeros(2, dtype=recordwith2darray).view(np.recarray) for pyfunc in ( - recarray_write_array_of_nestedarray_broadcast, - recarray_write_array_of_nestedarray, + recarray_write_array_of_nestedarray_broadcast, + recarray_write_array_of_nestedarray, ): arr_expected = pyfunc(arr) cfunc = self.get_cfunc(pyfunc, nbarr.dtype) @@ -606,5 +607,5 @@ def test_set_arrays(self): np.testing.assert_equal(arr_res, arr_expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py b/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py index 579275330..b73722e44 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py @@ -6,11 +6,11 @@ class TestSelfRecursion(CUDATestCase): - def setUp(self): # Avoid importing this module at the top level, as it triggers # compilation and can therefore fail from numba.cuda.tests.cudapy import recursion_usecases + self.mod = 
recursion_usecases super().setUp() @@ -36,19 +36,20 @@ def test_inner_explicit_sig(self): def test_global_implicit_sig(self): self.check_fib(self.mod.fib3) - @skip_on_cudasim('Simulator does not compile') + @skip_on_cudasim("Simulator does not compile") def test_runaway(self): with self.assertRaises(TypingError) as raises: cfunc = self.mod.runaway_self - @cuda.jit('void()') + @cuda.jit("void()") def kernel(): cfunc(1) - self.assertIn("cannot type infer runaway recursion", - str(raises.exception)) + self.assertIn( + "cannot type infer runaway recursion", str(raises.exception) + ) - @unittest.skip('Needs insert_unresolved_ref support in target') + @unittest.skip("Needs insert_unresolved_ref support in target") def test_type_change(self): pfunc = self.mod.type_change_self.py_func cfunc = self.mod.type_change_self @@ -79,7 +80,7 @@ def test_raise(self): self.assertEqual(str(raises.exception), "raise_self") - @unittest.skip('Needs insert_unresolved_ref support in target') + @unittest.skip("Needs insert_unresolved_ref support in target") def test_optional_return(self): pfunc = self.mod.make_optional_return_case() cfunc = self.mod.make_optional_return_case(cuda.jit) @@ -106,12 +107,13 @@ def cpu_kernel(x): self.assertEqual(expected, actual) - @skip_on_cudasim('Recursion handled because simulator does not compile') + @skip_on_cudasim("Recursion handled because simulator does not compile") def test_growing_return_tuple(self): cfunc = self.mod.make_growing_tuple_case(cuda.jit) with self.assertRaises(TypingError) as raises: - @cuda.jit('void()') + + @cuda.jit("void()") def kernel(): cfunc(100) @@ -121,5 +123,5 @@ def kernel(): ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py b/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py index 420fc7516..cd34b018b 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py @@ -10,7 
+10,7 @@ class TestReduction(CUDATestCase): def _sum_reduce(self, n): - A = (np.arange(n, dtype=np.float64) + 1) + A = np.arange(n, dtype=np.float64) + 1 expect = A.sum() got = sum_reduce(A) self.assertEqual(expect, got) @@ -19,24 +19,39 @@ def test_sum_reduce(self): if ENABLE_CUDASIM: # Minimal test set for the simulator (which only wraps # functools.reduce) - test_sizes = [ 1, 16 ] + test_sizes = [1, 16] else: # Tests around the points where blocksize changes, and around larger # powers of two, sums of powers of two, and some "random" sizes - test_sizes = [ 1, 15, 16, 17, 127, 128, 129, 1023, 1024, - 1025, 1536, 1048576, 1049600, 1049728, 34567 ] + test_sizes = [ + 1, + 15, + 16, + 17, + 127, + 128, + 129, + 1023, + 1024, + 1025, + 1536, + 1048576, + 1049600, + 1049728, + 34567, + ] # Avoid recompilation by keeping sum_reduce here for n in test_sizes: self._sum_reduce(n) def test_empty_array_host(self): - A = (np.arange(0, dtype=np.float64) + 1) + A = np.arange(0, dtype=np.float64) + 1 expect = A.sum() got = sum_reduce(A) self.assertEqual(expect, got) def test_empty_array_device(self): - A = (np.arange(0, dtype=np.float64) + 1) + A = np.arange(0, dtype=np.float64) + 1 dA = cuda.to_device(A) expect = A.sum() got = sum_reduce(dA) @@ -44,27 +59,27 @@ def test_empty_array_device(self): def test_prod_reduce(self): prod_reduce = cuda.reduce(lambda a, b: a * b) - A = (np.arange(64, dtype=np.float64) + 1) + A = np.arange(64, dtype=np.float64) + 1 expect = A.prod() got = prod_reduce(A, init=1) np.testing.assert_allclose(expect, got) def test_max_reduce(self): max_reduce = cuda.Reduce(lambda a, b: max(a, b)) - A = (np.arange(3717, dtype=np.float64) + 1) + A = np.arange(3717, dtype=np.float64) + 1 expect = A.max() got = max_reduce(A, init=0) self.assertEqual(expect, got) def test_non_identity_init(self): init = 3 - A = (np.arange(10, dtype=np.float64) + 1) + A = np.arange(10, dtype=np.float64) + 1 expect = A.sum() + init got = sum_reduce(A, init=init) 
self.assertEqual(expect, got) def test_result_on_device(self): - A = (np.arange(10, dtype=np.float64) + 1) + A = np.arange(10, dtype=np.float64) + 1 got = cuda.to_device(np.zeros(1, dtype=np.float64)) expect = A.sum() res = sum_reduce(A, res=got) @@ -72,5 +87,5 @@ def test_result_on_device(self): self.assertEqual(expect, got[0]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py b/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py index 640efcac3..52b137c74 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py @@ -21,13 +21,10 @@ def set_array_to_three(arr): def set_record_to_three(rec): - rec[0]['b'] = 3 + rec[0]["b"] = 3 -recordtype = np.dtype( - [('b', np.int32)], - align=True -) +recordtype = np.dtype([("b", np.int32)], align=True) class TestRetrieveAutoconvertedArrays(CUDATestCase): @@ -61,23 +58,23 @@ def test_array_default(self): def test_record_in(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](cuda.In(host_rec)) - self.assertEqual(0, host_rec[0]['b']) + self.assertEqual(0, host_rec[0]["b"]) def test_record_inout(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](cuda.InOut(host_rec)) - self.assertEqual(3, host_rec[0]['b']) + self.assertEqual(3, host_rec[0]["b"]) def test_record_default(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three[1, 1](host_rec) - self.assertEqual(3, host_rec[0]['b']) + self.assertEqual(3, host_rec[0]["b"]) def test_record_in_from_config(self): host_rec = np.zeros(1, dtype=recordtype) self.set_record_to_three_nocopy[1, 1](host_rec) - self.assertEqual(0, host_rec[0]['b']) + self.assertEqual(0, host_rec[0]["b"]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py b/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py index b98aa85a0..08ed0d6b3 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py @@ -7,9 +7,8 @@ from numba.np import numpy_support -@skip_on_cudasim('pickling not supported in CUDASIM') +@skip_on_cudasim("pickling not supported in CUDASIM") class TestPickle(CUDATestCase): - def check_call(self, callee): arr = np.array([100]) expected = callee[1, 1](arr) @@ -41,14 +40,13 @@ def test_pickling_jit_typing(self): def inner(a): return a + 1 - @cuda.jit('void(intp[:])') + @cuda.jit("void(intp[:])") def foo(arr): arr[0] = inner(arr[0]) self.check_call(foo) def test_pickling_jit(self): - @cuda.jit(device=True) def inner(a): return a + 1 @@ -60,7 +58,7 @@ def foo(arr): self.check_call(foo) def test_pickling_vectorize(self): - @vectorize(['intp(intp)', 'float64(float64)'], target='cuda') + @vectorize(["intp(intp)", "float64(float64)"], target="cuda") def cuda_vect(x): return x * 2 @@ -81,5 +79,5 @@ def cuda_vect(x): np.testing.assert_equal(expected, got2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py b/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py index f5a3df7f3..40f2c05f4 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py @@ -33,5 +33,5 @@ def test_assign_empty_slice(self): arr[:] = cuda.to_device(a) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sm.py b/numba_cuda/numba/cuda/tests/cudapy/test_sm.py index b61784a73..c037d1a39 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sm.py @@ -8,8 +8,7 @@ from .extensions_usecases import test_struct_model_type, TestStruct -recordwith2darray = np.dtype([('i', 
np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) class TestSharedMemoryIssue(CUDATestCase): @@ -42,7 +41,6 @@ def test_issue_1051_shared_size_broken_2d(self): self._check_shared_array_size((2, 3), 6) def test_issue_1051_shared_size_broken_3d(self): - self._check_shared_array_size((2, 3, 4), 24) def _check_shared_array_size_fp16(self, shape, expected, ty): @@ -71,8 +69,9 @@ def test_issue_2393(self): @cuda.jit def costs_func(d_block_costs): - s_features = cuda.shared.array((examples_per_block, num_weights), - float64) + s_features = cuda.shared.array( + (examples_per_block, num_weights), float64 + ) s_initialcost = cuda.shared.array(7, float64) # Bug threadIdx = cuda.threadIdx.x @@ -364,7 +363,7 @@ def test_issue_5073(self): def sm_slice_copy(x, y, chunksize): dynsmem = cuda.shared.array(0, dtype=dt) sm1 = dynsmem[0:chunksize] - sm2 = dynsmem[chunksize:chunksize * 2] + sm2 = dynsmem[chunksize : chunksize * 2] tx = cuda.threadIdx.x bx = cuda.blockIdx.x @@ -396,14 +395,16 @@ def test_invalid_array_type(self): rgx = ".*Cannot infer the type of variable 'arr'.*" def unsupported_type(): - arr = cuda.shared.array(10, dtype=np.dtype('O')) # noqa: F841 + arr = cuda.shared.array(10, dtype=np.dtype("O")) # noqa: F841 + with self.assertRaisesRegex(TypingError, rgx): cuda.jit(void())(unsupported_type) rgx = ".*Invalid NumPy dtype specified: 'int33'.*" def invalid_string_type(): - arr = cuda.shared.array(10, dtype='int33') # noqa: F841 + arr = cuda.shared.array(10, dtype="int33") # noqa: F841 + with self.assertRaisesRegex(TypingError, rgx): cuda.jit(void())(invalid_string_type) @@ -440,5 +441,5 @@ def write_then_reverse_read_static(outx, outy): self.assertEqual(y, (nthreads - i - 1) * 2) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py b/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py index bff48e642..cfc09d5c2 
100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py @@ -17,15 +17,17 @@ def udt_global_constants(A): def udt_global_build_tuple(A): - sa = cuda.shared.array(shape=(GLOBAL_CONSTANT, GLOBAL_CONSTANT_2), - dtype=float32) + sa = cuda.shared.array( + shape=(GLOBAL_CONSTANT, GLOBAL_CONSTANT_2), dtype=float32 + ) i, j = cuda.grid(2) A[i, j] = sa[i, j] def udt_global_build_list(A): - sa = cuda.shared.array(shape=[GLOBAL_CONSTANT, GLOBAL_CONSTANT_2], - dtype=float32) + sa = cuda.shared.array( + shape=[GLOBAL_CONSTANT, GLOBAL_CONSTANT_2], dtype=float32 + ) i, j = cuda.grid(2) A[i, j] = sa[i, j] @@ -59,7 +61,7 @@ def getarg(self): return np.array(100, dtype=np.float32, ndmin=1) def getarg2(self): - return self.getarg().reshape(1,1) + return self.getarg().reshape(1, 1) def test_global_constants(self): udt = cuda.jit((float32[:],))(udt_global_constants) @@ -69,18 +71,21 @@ def test_global_build_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_build_tuple) udt[1, 1](self.getarg2()) - @skip_on_cudasim('Simulator does not prohibit lists for shared array shape') + @skip_on_cudasim("Simulator does not prohibit lists for shared array shape") def test_global_build_list(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:, :],))(udt_global_build_list) - self.assertIn("No implementation of function " - "Function(>> array(shape=list(int64), " - "dtype=class(float32)", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=list(int64), " + "dtype=class(float32)", + str(raises.exception), + ) def test_global_constant_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_constant_tuple) @@ -92,12 +97,15 @@ def test_invalid_1(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:],))(udt_invalid_1) - self.assertIn("No implementation of function " - "Function(>> array(shape=float32, dtype=class(float32))", - 
str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=float32, dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_2(self): @@ -105,13 +113,16 @@ def test_invalid_2(self): with self.assertRaises(TypingError) as raises: cuda.jit((float32[:, :],))(udt_invalid_2) - self.assertIn("No implementation of function " - "Function(>> array(shape=Tuple(Literal[int](1), " - "array(float32, 1d, A)), dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=Tuple(Literal[int](1), " + "array(float32, 1d, A)), dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_3(self): @@ -119,12 +130,15 @@ def test_invalid_3(self): with self.assertRaises(TypingError) as raises: cuda.jit((int32[:],))(udt_invalid_1) - self.assertIn("No implementation of function " - "Function(>> array(shape=int32, dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=int32, dtype=class(float32))", + str(raises.exception), + ) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_4(self): @@ -132,18 +146,21 @@ def test_invalid_4(self): with self.assertRaises(TypingError) as raises: cuda.jit((int32[:],))(udt_invalid_3) - self.assertIn("No implementation of function " - "Function(>> array(shape=Tuple(Literal[int](1), int32), " - "dtype=class(float32))", - str(raises.exception)) + self.assertIn( + "No implementation of function Function(>> array(shape=Tuple(Literal[int](1), int32), " + "dtype=class(float32))", + str(raises.exception), + ) def check_dtype(self, f, dtype): # Find the typing of the dtype argument to cuda.shared.array annotation = next(iter(f.overloads.values()))._type_annotation - l_dtype = annotation.typemap['s'].dtype + l_dtype = 
annotation.typemap["s"].dtype # Ensure that the typing is correct self.assertEqual(l_dtype, dtype) @@ -174,7 +191,7 @@ def test_string_dtype(self): # Check that strings can be used to specify the dtype of a shared array @cuda.jit(void(int32[::1])) def f(x): - s = cuda.shared.array(10, dtype='int32') + s = cuda.shared.array(10, dtype="int32") s[0] = x[0] x[0] = s[0] @@ -185,9 +202,10 @@ def test_invalid_string_dtype(self): # Check that strings of invalid dtypes cause a typing error re = ".*Invalid NumPy dtype specified: 'int33'.*" with self.assertRaisesRegex(TypingError, re): + @cuda.jit(void(int32[::1])) def f(x): - s = cuda.shared.array(10, dtype='int33') + s = cuda.shared.array(10, dtype="int33") s[0] = x[0] x[0] = s[0] @@ -198,8 +216,9 @@ def f(x): s = cuda.shared.array(10, dtype=test_struct_model_type) s[0] = x[0] x[0] = s[0] + self.check_dtype(f, test_struct_model_type) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py b/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py index c27055b02..8367b460e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py @@ -1,5 +1,9 @@ -from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest, - CUDATestCase) +from numba.cuda.testing import ( + skip_on_cudasim, + skip_unless_cudasim, + unittest, + CUDATestCase, +) from numba import config, cuda # Basic tests that stream APIs execute on the hardware and in the simulator. 
@@ -48,5 +52,5 @@ def test_external_stream_simulator_unavailable(self): cuda.external_stream(ptr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_sync.py b/numba_cuda/numba/cuda/tests/cudapy/test_sync.py index d4d9326f0..4eaff55c9 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_sync.py @@ -133,14 +133,16 @@ def test_useless_syncwarp(self): self._test_useless(useless_syncwarp) @skip_on_cudasim("syncwarp not implemented on cudasim") - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Partial masks require CC 7.0 or greater") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Partial masks require CC 7.0 or greater" + ) def test_useless_syncwarp_with_mask(self): self._test_useless(useless_syncwarp_with_mask) @skip_on_cudasim("syncwarp not implemented on cudasim") - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Partial masks require CC 7.0 or greater") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Partial masks require CC 7.0 or greater" + ) def test_coop_syncwarp(self): # coop_syncwarp computes the sum of all integers from 0 to 31 (496) # using a single warp @@ -267,5 +269,5 @@ def test_syncthreads_or_downcast(self): self._test_syncthreads_or(np.int64) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py b/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py index 9c13db534..38243b78d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py @@ -5,17 +5,17 @@ from numba.cuda.testing import skip_on_cudasim, CUDATestCase -recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) +recordwith2darray = np.dtype([("i", np.int32), ("j", np.float32, (3, 2))]) -@skip_on_cudasim('Device Array API unsupported in the simulator') +@skip_on_cudasim("Device Array API 
unsupported in the simulator") class TestTranspose(CUDATestCase): - def test_transpose(self): - variants = ((5, 6, np.float64), - (128, 128, np.complex128), - (1025, 512, np.float64)) + variants = ( + (5, 6, np.float64), + (128, 128, np.complex128), + (1025, 512, np.float64), + ) for rows, cols, dtype in variants: with self.subTest(rows=rows, cols=cols, dtype=dtype): @@ -27,8 +27,15 @@ def test_transpose(self): dy.copy_to_host(y) np.testing.assert_array_equal(x.transpose(), y) - small_variants = ((2, 3), (16, 16), (16, 17), (17, 16), (14, 15), (15, 14), - (14, 14)) + small_variants = ( + (2, 3), + (16, 16), + (16, 17), + (17, 16), + (14, 15), + (15, 14), + (14, 14), + ) def test_transpose_record(self): for rows, cols in self.small_variants: @@ -36,7 +43,7 @@ def test_transpose_record(self): arr = np.recarray((rows, cols), dtype=recordwith2darray) for x in range(rows): for y in range(cols): - arr[x, y].i = x ** 2 + y + arr[x, y].i = x**2 + y j = np.arange(3 * 2, dtype=np.float32) arr[x, y].j = j.reshape(3, 2) * x + y @@ -76,5 +83,5 @@ def test_transpose_view(self): np.testing.assert_array_equal(a_view_t, h_a_view_t) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py index 7a98abde7..63340ecce 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py @@ -9,11 +9,11 @@ def _make_ufunc_usecase(ufunc): ldict = {} - arg_str = ','.join(['a{0}'.format(i) for i in range(ufunc.nargs)]) - func_str = f'def fn({arg_str}):\n np.{ufunc.__name__}({arg_str})' + arg_str = ",".join(["a{0}".format(i) for i in range(ufunc.nargs)]) + func_str = f"def fn({arg_str}):\n np.{ufunc.__name__}({arg_str})" exec(func_str, globals(), ldict) - fn = ldict['fn'] - fn.__name__ = '{0}_usecase'.format(ufunc.__name__) + fn = ldict["fn"] + fn.__name__ = "{0}_usecase".format(ufunc.__name__) return fn @@ -32,58 
+32,75 @@ def setUp(self): # The basic ufunc test does not set up complex inputs, so we'll add # some here for testing with CUDA. - self.inputs.extend([ - (np.complex64(-0.5 - 0.5j), types.complex64), - (np.complex64(0.0), types.complex64), - (np.complex64(0.5 + 0.5j), types.complex64), - - (np.complex128(-0.5 - 0.5j), types.complex128), - (np.complex128(0.0), types.complex128), - (np.complex128(0.5 + 0.5j), types.complex128), - - (np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype='c8'), - types.Array(types.complex64, 1, 'C')), - (np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype='c16'), - types.Array(types.complex128, 1, 'C')), - ]) + self.inputs.extend( + [ + (np.complex64(-0.5 - 0.5j), types.complex64), + (np.complex64(0.0), types.complex64), + (np.complex64(0.5 + 0.5j), types.complex64), + (np.complex128(-0.5 - 0.5j), types.complex128), + (np.complex128(0.0), types.complex128), + (np.complex128(0.5 + 0.5j), types.complex128), + ( + np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c8"), + types.Array(types.complex64, 1, "C"), + ), + ( + np.array([-0.5 - 0.5j, 0.0, 0.5 + 0.5j], dtype="c16"), + types.Array(types.complex128, 1, "C"), + ), + ] + ) # Test with multiple dimensions - self.inputs.extend([ - # Basic 2D and 3D arrays - (np.linspace(0, 1).reshape((5, -1)), - types.Array(types.float64, 2, 'C')), - (np.linspace(0, 1).reshape((2, 5, -1)), - types.Array(types.float64, 3, 'C')), - # Complex data (i.e. interleaved) - (np.linspace(0, 1 + 1j).reshape(5, -1), - types.Array(types.complex128, 2, 'C')), - # F-ordered - (np.asfortranarray(np.linspace(0, 1).reshape((5, -1))), - types.Array(types.float64, 2, 'F')), - ]) + self.inputs.extend( + [ + # Basic 2D and 3D arrays + ( + np.linspace(0, 1).reshape((5, -1)), + types.Array(types.float64, 2, "C"), + ), + ( + np.linspace(0, 1).reshape((2, 5, -1)), + types.Array(types.float64, 3, "C"), + ), + # Complex data (i.e. 
interleaved) + ( + np.linspace(0, 1 + 1j).reshape(5, -1), + types.Array(types.complex128, 2, "C"), + ), + # F-ordered + ( + np.asfortranarray(np.linspace(0, 1).reshape((5, -1))), + types.Array(types.float64, 2, "F"), + ), + ] + ) # Add tests for other integer types - self.inputs.extend([ - (np.uint8(0), types.uint8), - (np.uint8(1), types.uint8), - (np.int8(-1), types.int8), - (np.int8(0), types.int8), - - (np.uint16(0), types.uint16), - (np.uint16(1), types.uint16), - (np.int16(-1), types.int16), - (np.int16(0), types.int16), - - (np.ulonglong(0), types.ulonglong), - (np.ulonglong(1), types.ulonglong), - (np.longlong(-1), types.longlong), - (np.longlong(0), types.longlong), - - (np.array([0,1], dtype=np.ulonglong), - types.Array(types.ulonglong, 1, 'C')), - (np.array([0,1], dtype=np.longlong), - types.Array(types.longlong, 1, 'C')), - ]) + self.inputs.extend( + [ + (np.uint8(0), types.uint8), + (np.uint8(1), types.uint8), + (np.int8(-1), types.int8), + (np.int8(0), types.int8), + (np.uint16(0), types.uint16), + (np.uint16(1), types.uint16), + (np.int16(-1), types.int16), + (np.int16(0), types.int16), + (np.ulonglong(0), types.ulonglong), + (np.ulonglong(1), types.ulonglong), + (np.longlong(-1), types.longlong), + (np.longlong(0), types.longlong), + ( + np.array([0, 1], dtype=np.ulonglong), + types.Array(types.ulonglong, 1, "C"), + ), + ( + np.array([0, 1], dtype=np.longlong), + types.Array(types.longlong, 1, "C"), + ), + ] + ) self._low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS self._warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY @@ -111,18 +128,18 @@ def basic_int_ufunc_test(self, name=None): skip_inputs = [ types.float32, types.float64, - types.Array(types.float32, 1, 'C'), - types.Array(types.float32, 2, 'C'), - types.Array(types.float64, 1, 'C'), - types.Array(types.float64, 2, 'C'), - types.Array(types.float64, 3, 'C'), - types.Array(types.float64, 2, 'F'), + types.Array(types.float32, 1, "C"), + types.Array(types.float32, 2, "C"), + 
types.Array(types.float64, 1, "C"), + types.Array(types.float64, 2, "C"), + types.Array(types.float64, 3, "C"), + types.Array(types.float64, 2, "F"), types.complex64, types.complex128, - types.Array(types.complex64, 1, 'C'), - types.Array(types.complex64, 2, 'C'), - types.Array(types.complex128, 1, 'C'), - types.Array(types.complex128, 2, 'C'), + types.Array(types.complex64, 1, "C"), + types.Array(types.complex64, 2, "C"), + types.Array(types.complex128, 1, "C"), + types.Array(types.complex128, 2, "C"), ] self.basic_ufunc_test(name, skip_inputs=skip_inputs) @@ -130,43 +147,43 @@ def basic_int_ufunc_test(self, name=None): # Trigonometric Functions def test_sin_ufunc(self): - self.basic_ufunc_test(np.sin, kinds='cf') + self.basic_ufunc_test(np.sin, kinds="cf") def test_cos_ufunc(self): - self.basic_ufunc_test(np.cos, kinds='cf') + self.basic_ufunc_test(np.cos, kinds="cf") def test_tan_ufunc(self): - self.basic_ufunc_test(np.tan, kinds='cf') + self.basic_ufunc_test(np.tan, kinds="cf") def test_arcsin_ufunc(self): - self.basic_ufunc_test(np.arcsin, kinds='cf') + self.basic_ufunc_test(np.arcsin, kinds="cf") def test_arccos_ufunc(self): - self.basic_ufunc_test(np.arccos, kinds='cf') + self.basic_ufunc_test(np.arccos, kinds="cf") def test_arctan_ufunc(self): - self.basic_ufunc_test(np.arctan, kinds='cf') + self.basic_ufunc_test(np.arctan, kinds="cf") def test_arctan2_ufunc(self): - self.basic_ufunc_test(np.arctan2, kinds='f') + self.basic_ufunc_test(np.arctan2, kinds="f") def test_hypot_ufunc(self): - self.basic_ufunc_test(np.hypot, kinds='f') + self.basic_ufunc_test(np.hypot, kinds="f") def test_sinh_ufunc(self): - self.basic_ufunc_test(np.sinh, kinds='cf') + self.basic_ufunc_test(np.sinh, kinds="cf") def test_cosh_ufunc(self): - self.basic_ufunc_test(np.cosh, kinds='cf') + self.basic_ufunc_test(np.cosh, kinds="cf") def test_tanh_ufunc(self): - self.basic_ufunc_test(np.tanh, kinds='cf') + self.basic_ufunc_test(np.tanh, kinds="cf") def test_arcsinh_ufunc(self): - 
self.basic_ufunc_test(np.arcsinh, kinds='cf') + self.basic_ufunc_test(np.arcsinh, kinds="cf") def test_arccosh_ufunc(self): - self.basic_ufunc_test(np.arccosh, kinds='cf') + self.basic_ufunc_test(np.arccosh, kinds="cf") def test_arctanh_ufunc(self): # arctanh is only valid is only finite in the range ]-1, 1[ @@ -177,24 +194,30 @@ def test_arctanh_ufunc(self): # used to compile NumPy may differ from the result generated by # llvm. Skipping the integer types in this test avoids failed # tests because of this. - to_skip = [types.Array(types.uint32, 1, 'C'), types.uint32, - types.Array(types.int32, 1, 'C'), types.int32, - types.Array(types.uint64, 1, 'C'), types.uint64, - types.Array(types.int64, 1, 'C'), types.int64] + to_skip = [ + types.Array(types.uint32, 1, "C"), + types.uint32, + types.Array(types.int32, 1, "C"), + types.int32, + types.Array(types.uint64, 1, "C"), + types.uint64, + types.Array(types.int64, 1, "C"), + types.int64, + ] - self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds='cf') + self.basic_ufunc_test(np.arctanh, skip_inputs=to_skip, kinds="cf") def test_deg2rad_ufunc(self): - self.basic_ufunc_test(np.deg2rad, kinds='f') + self.basic_ufunc_test(np.deg2rad, kinds="f") def test_rad2deg_ufunc(self): - self.basic_ufunc_test(np.rad2deg, kinds='f') + self.basic_ufunc_test(np.rad2deg, kinds="f") def test_degrees_ufunc(self): - self.basic_ufunc_test(np.degrees, kinds='f') + self.basic_ufunc_test(np.degrees, kinds="f") def test_radians_ufunc(self): - self.basic_ufunc_test(np.radians, kinds='f') + self.basic_ufunc_test(np.radians, kinds="f") ############################################################################ # Comparison functions @@ -264,14 +287,14 @@ def test_bitwise_not_ufunc(self): # Mathematical Functions def test_log_ufunc(self): - self.basic_ufunc_test(np.log, kinds='cf') + self.basic_ufunc_test(np.log, kinds="cf") def test_log2_ufunc(self): - self.basic_ufunc_test(np.log2, kinds='cf') + self.basic_ufunc_test(np.log2, kinds="cf") def 
test_log10_ufunc(self): - self.basic_ufunc_test(np.log10, kinds='cf') + self.basic_ufunc_test(np.log10, kinds="cf") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py b/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py index 6073c3f3f..b444c9155 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py @@ -13,7 +13,6 @@ class MyError(Exception): class TestUserExc(CUDATestCase): - def setUp(self): super().setUp() # LTO optimizes away the exception status due to an oversight @@ -29,7 +28,7 @@ def test_exc(x): elif x == 2: raise MyError("foo") - test_exc[1, 1](0) # no raise + test_exc[1, 1](0) # no raise with self.assertRaises(MyError) as cm: test_exc[1, 1](1) if not config.ENABLE_CUDASIM: @@ -43,5 +42,5 @@ def test_exc(x): self.assertIn("tid=[0, 0, 0] ctaid=[0, 0, 0]: foo", str(cm.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py b/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py index 1ee72f2d3..9fef225df 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py @@ -44,12 +44,7 @@ def kernel_3elem(res): res[2] = v.z def kernel_4elem(res): - v = vobj( - base_type(0), - base_type(1), - base_type(2), - base_type(3) - ) + v = vobj(base_type(0), base_type(1), base_type(2), base_type(3)) res[0] = v.x res[1] = v.y res[2] = v.z @@ -59,7 +54,7 @@ def kernel_4elem(res): 1: kernel_1elem, 2: kernel_2elem, 3: kernel_3elem, - 4: kernel_4elem + 4: kernel_4elem, }[vtype.num_elements] return cuda.jit(host_function) @@ -83,13 +78,13 @@ def kernel(res): three = base_type(3.0) four = base_type(4.0) - j = 0 # index of the result array + j = 0 # index of the result array # Construct a 1-component vector type, possible combination includes: # 2C1 = 2 combinations. 
f1_1 = v1(one) # 1 - f1_2 = v1(f1_1) # 1 + f1_2 = v1(f1_1) # 1 res[0] = f1_1.x res[1] = f1_2.x @@ -98,11 +93,11 @@ def kernel(res): # Construct a 2-component vector type, possible combination includes: # 1 + 2C1 * 2 = 5 combinations - f2_1 = v2(two, three) # 2 3 - f2_2 = v2(f1_1, three) # 1 3 - f2_3 = v2(two, f1_1) # 2 1 - f2_4 = v2(f1_1, f1_1) # 1 1 - f2_5 = v2(f2_1) # 2 3 + f2_1 = v2(two, three) # 2 3 + f2_2 = v2(f1_1, three) # 1 3 + f2_3 = v2(two, f1_1) # 2 1 + f2_4 = v2(f1_1, f1_1) # 1 1 + f2_5 = v2(f2_1) # 2 3 for v in (f2_1, f2_2, f2_3, f2_4, f2_5): res[j] = v.x @@ -112,24 +107,37 @@ def kernel(res): # Construct a 3-component vector type, possible combination includes: # 1 + 2C1 * 2 + 2^3 = 13 combinations - f3_1 = v3(f2_1, one) # 2 3 1 - f3_2 = v3(f2_1, f1_1) # 2 3 1 - f3_3 = v3(one, f2_1) # 1 2 3 - f3_4 = v3(f1_1, f2_1) # 1 2 3 - - f3_5 = v3(one, two, three) # 1 2 3 - f3_6 = v3(f1_1, two, three) # 1 2 3 - f3_7 = v3(one, f1_1, three) # 1 1 3 - f3_8 = v3(one, two, f1_1) # 1 2 1 - f3_9 = v3(f1_1, f1_1, three) # 1 1 3 - f3_10 = v3(one, f1_1, f1_1) # 1 1 1 - f3_11 = v3(f1_1, two, f1_1) # 1 2 1 - f3_12 = v3(f1_1, f1_1, f1_1) # 1 1 1 - - f3_13 = v3(f3_1) # 2 3 1 - - for v in (f3_1, f3_2, f3_3, f3_4, f3_5, f3_6, f3_7, f3_8, f3_9, - f3_10, f3_11, f3_12, f3_13): + f3_1 = v3(f2_1, one) # 2 3 1 + f3_2 = v3(f2_1, f1_1) # 2 3 1 + f3_3 = v3(one, f2_1) # 1 2 3 + f3_4 = v3(f1_1, f2_1) # 1 2 3 + + f3_5 = v3(one, two, three) # 1 2 3 + f3_6 = v3(f1_1, two, three) # 1 2 3 + f3_7 = v3(one, f1_1, three) # 1 1 3 + f3_8 = v3(one, two, f1_1) # 1 2 1 + f3_9 = v3(f1_1, f1_1, three) # 1 1 3 + f3_10 = v3(one, f1_1, f1_1) # 1 1 1 + f3_11 = v3(f1_1, two, f1_1) # 1 2 1 + f3_12 = v3(f1_1, f1_1, f1_1) # 1 1 1 + + f3_13 = v3(f3_1) # 2 3 1 + + for v in ( + f3_1, + f3_2, + f3_3, + f3_4, + f3_5, + f3_6, + f3_7, + f3_8, + f3_9, + f3_10, + f3_11, + f3_12, + f3_13, + ): res[j] = v.x res[j + 1] = v.y res[j + 2] = v.z @@ -138,48 +146,80 @@ def kernel(res): # Construct a 4-component vector type, 
possible combination includes: # 1 + (2C1 * 2 + 1) + 3C1 * 2^2 + 2^4 = 34 combinations - f4_1 = v4(one, two, three, four) # 1 2 3 4 - f4_2 = v4(f1_1, two, three, four) # 1 2 3 4 - f4_3 = v4(one, f1_1, three, four) # 1 1 3 4 - f4_4 = v4(one, two, f1_1, four) # 1 2 1 4 - f4_5 = v4(one, two, three, f1_1) # 1 2 3 1 + f4_1 = v4(one, two, three, four) # 1 2 3 4 + f4_2 = v4(f1_1, two, three, four) # 1 2 3 4 + f4_3 = v4(one, f1_1, three, four) # 1 1 3 4 + f4_4 = v4(one, two, f1_1, four) # 1 2 1 4 + f4_5 = v4(one, two, three, f1_1) # 1 2 3 1 f4_6 = v4(f1_1, f1_1, three, four) # 1 1 3 4 - f4_7 = v4(f1_1, two, f1_1, four) # 1 2 1 4 - f4_8 = v4(f1_1, two, three, f1_1) # 1 2 3 1 - f4_9 = v4(one, f1_1, f1_1, four) # 1 1 1 4 + f4_7 = v4(f1_1, two, f1_1, four) # 1 2 1 4 + f4_8 = v4(f1_1, two, three, f1_1) # 1 2 3 1 + f4_9 = v4(one, f1_1, f1_1, four) # 1 1 1 4 f4_10 = v4(one, f1_1, three, f1_1) # 1 1 3 1 - f4_11 = v4(one, two, f1_1, f1_1) # 1 2 1 1 + f4_11 = v4(one, two, f1_1, f1_1) # 1 2 1 1 f4_12 = v4(f1_1, f1_1, f1_1, four) # 1 1 1 4 - f4_13 = v4(f1_1, f1_1, three, f1_1) # 1 1 3 1 - f4_14 = v4(f1_1, two, f1_1, f1_1) # 1 2 1 1 - f4_15 = v4(one, f1_1, f1_1, f1_1) # 1 1 1 1 + f4_13 = v4(f1_1, f1_1, three, f1_1) # 1 1 3 1 + f4_14 = v4(f1_1, two, f1_1, f1_1) # 1 2 1 1 + f4_15 = v4(one, f1_1, f1_1, f1_1) # 1 1 1 1 f4_16 = v4(f1_1, f1_1, f1_1, f1_1) # 1 1 1 1 - f4_17 = v4(f2_1, two, three) # 2 3 2 3 - f4_18 = v4(f2_1, f1_1, three) # 2 3 1 3 - f4_19 = v4(f2_1, two, f1_1) # 2 3 2 1 - f4_20 = v4(f2_1, f1_1, f1_1) # 2 3 1 1 - f4_21 = v4(one, f2_1, three) # 1 2 3 3 - f4_22 = v4(f1_1, f2_1, three) # 1 2 3 3 - f4_23 = v4(one, f2_1, f1_1) # 1 2 3 1 - f4_24 = v4(f1_1, f2_1, f1_1) # 1 2 3 1 - f4_25 = v4(one, four, f2_1) # 1 4 2 3 - f4_26 = v4(f1_1, four, f2_1) # 1 4 2 3 - f4_27 = v4(one, f1_1, f2_1) # 1 1 2 3 - f4_28 = v4(f1_1, f1_1, f2_1) # 1 1 2 3 - - f4_29 = v4(f2_1, f2_1) # 2 3 2 3 - f4_30 = v4(f3_1, four) # 2 3 1 4 - f4_31 = v4(f3_1, f1_1) # 2 3 1 1 - f4_32 = v4(four, f3_1) # 4 2 3 1 - 
f4_33 = v4(f1_1, f3_1) # 1 2 3 1 - - f4_34 = v4(f4_1) # 1 2 3 4 - - for v in (f4_1, f4_2, f4_3, f4_4, f4_5, f4_6, f4_7, f4_8, f4_9, f4_10, - f4_11, f4_12, f4_13, f4_14, f4_15, f4_16, f4_17, f4_18, f4_19, - f4_20, f4_21, f4_22, f4_23, f4_24, f4_25, f4_26, f4_27, f4_28, - f4_29, f4_30, f4_31, f4_32, f4_33, f4_34): + f4_17 = v4(f2_1, two, three) # 2 3 2 3 + f4_18 = v4(f2_1, f1_1, three) # 2 3 1 3 + f4_19 = v4(f2_1, two, f1_1) # 2 3 2 1 + f4_20 = v4(f2_1, f1_1, f1_1) # 2 3 1 1 + f4_21 = v4(one, f2_1, three) # 1 2 3 3 + f4_22 = v4(f1_1, f2_1, three) # 1 2 3 3 + f4_23 = v4(one, f2_1, f1_1) # 1 2 3 1 + f4_24 = v4(f1_1, f2_1, f1_1) # 1 2 3 1 + f4_25 = v4(one, four, f2_1) # 1 4 2 3 + f4_26 = v4(f1_1, four, f2_1) # 1 4 2 3 + f4_27 = v4(one, f1_1, f2_1) # 1 1 2 3 + f4_28 = v4(f1_1, f1_1, f2_1) # 1 1 2 3 + + f4_29 = v4(f2_1, f2_1) # 2 3 2 3 + f4_30 = v4(f3_1, four) # 2 3 1 4 + f4_31 = v4(f3_1, f1_1) # 2 3 1 1 + f4_32 = v4(four, f3_1) # 4 2 3 1 + f4_33 = v4(f1_1, f3_1) # 1 2 3 1 + + f4_34 = v4(f4_1) # 1 2 3 4 + + for v in ( + f4_1, + f4_2, + f4_3, + f4_4, + f4_5, + f4_6, + f4_7, + f4_8, + f4_9, + f4_10, + f4_11, + f4_12, + f4_13, + f4_14, + f4_15, + f4_16, + f4_17, + f4_18, + f4_19, + f4_20, + f4_21, + f4_22, + f4_23, + f4_24, + f4_25, + f4_26, + f4_27, + f4_28, + f4_29, + f4_30, + f4_31, + f4_32, + f4_33, + f4_34, + ): res[j] = v.x res[j + 1] = v.y res[j + 2] = v.z @@ -190,13 +230,13 @@ def kernel(res): class TestCudaVectorType(CUDATestCase): - def test_basic(self): """Basic test that makes sure that vector type and aliases are available within the cuda module from both device and simulator mode. This is an important sanity check, since other tests below tests the vector type objects programmatically. 
""" + @cuda.jit("void(float64[:])") def kernel(arr): v1 = cuda.float64x4(1.0, 3.0, 5.0, 7.0) @@ -227,66 +267,201 @@ def test_fancy_creation_readout(self): with self.subTest(vty=vty): kernel = make_fancy_creation_kernel(vty) - expected = np.array([ - # 1-component vectors - 1, - 1, - # 2-component vectors - 2, 3, - 1, 3, - 2, 1, - 1, 1, - 2, 3, - # 3-component vectors - 2, 3, 1, - 2, 3, 1, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 2, 3, - 1, 1, 3, - 1, 2, 1, - 1, 1, 3, - 1, 1, 1, - 1, 2, 1, - 1, 1, 1, - 2, 3, 1, - # 4-component vectors - 1, 2, 3, 4, - 1, 2, 3, 4, - 1, 1, 3, 4, - 1, 2, 1, 4, - 1, 2, 3, 1, - 1, 1, 3, 4, - 1, 2, 1, 4, - 1, 2, 3, 1, - 1, 1, 1, 4, - 1, 1, 3, 1, - 1, 2, 1, 1, - 1, 1, 1, 4, - 1, 1, 3, 1, - 1, 2, 1, 1, - 1, 1, 1, 1, - 1, 1, 1, 1, - 2, 3, 2, 3, - 2, 3, 1, 3, - 2, 3, 2, 1, - 2, 3, 1, 1, - 1, 2, 3, 3, - 1, 2, 3, 3, - 1, 2, 3, 1, - 1, 2, 3, 1, - 1, 4, 2, 3, - 1, 4, 2, 3, - 1, 1, 2, 3, - 1, 1, 2, 3, - 2, 3, 2, 3, - 2, 3, 1, 4, - 2, 3, 1, 1, - 4, 2, 3, 1, - 1, 2, 3, 1, - 1, 2, 3, 4 - ]) + expected = np.array( + [ + # 1-component vectors + 1, + 1, + # 2-component vectors + 2, + 3, + 1, + 3, + 2, + 1, + 1, + 1, + 2, + 3, + # 3-component vectors + 2, + 3, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 1, + 3, + 1, + 2, + 1, + 1, + 1, + 3, + 1, + 1, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 2, + 3, + 1, + # 4-component vectors + 1, + 2, + 3, + 4, + 1, + 2, + 3, + 4, + 1, + 1, + 3, + 4, + 1, + 2, + 1, + 4, + 1, + 2, + 3, + 1, + 1, + 1, + 3, + 4, + 1, + 2, + 1, + 4, + 1, + 2, + 3, + 1, + 1, + 1, + 1, + 4, + 1, + 1, + 3, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 4, + 1, + 1, + 3, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 3, + 1, + 3, + 2, + 3, + 2, + 1, + 2, + 3, + 1, + 1, + 1, + 2, + 3, + 3, + 1, + 2, + 3, + 3, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 4, + 2, + 3, + 1, + 4, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 1, + 4, + 2, 
+ 3, + 1, + 1, + 4, + 2, + 3, + 1, + 1, + 2, + 3, + 1, + 1, + 2, + 3, + 4, + ] + ) arr = np.zeros(expected.shape) kernel[1, 1](arr) np.testing.assert_almost_equal(arr, expected) diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py index c88e1792b..f4c540ca1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py @@ -13,9 +13,11 @@ # Signatures to test with - these are all homogeneous in dtype, so the output # dtype should match the input dtype - the output should not have been cast # upwards, as reported in #8400: https://github.com/numba/numba/issues/8400 -signatures = [int32(int32, int32), - float32(float32, float32), - float64(float64, float64)] +signatures = [ + int32(int32, int32), + float32(float32, float32), + float64(float64, float64), +] # The order here is chosen such that each subsequent dtype might have been # casted to a previously-used dtype. This is unlikely to be an issue for CUDA, @@ -25,16 +27,16 @@ dtypes = (np.float64, np.float32, np.int32) # NumPy ndarray orders -orders = ('C', 'F') +orders = ("C", "F") # Input sizes corresponding to operations: # - Less than one warp, # - Less than one block, # - Greater than one block (i.e. 
many blocks) -input_sizes = (8, 100, 2 ** 10 + 1) +input_sizes = (8, 100, 2**10 + 1) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorize(CUDATestCase): # Presumably chosen as an odd number unlikely to coincide with the total # thread count, and large enough to ensure a significant number of blocks @@ -42,8 +44,7 @@ class TestCUDAVectorize(CUDATestCase): N = 1000001 def test_scalar(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -53,8 +54,7 @@ def vector_add(a, b): self.assertEqual(c, a + b) def test_1d(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -66,8 +66,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, ty) def test_1d_async(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -86,8 +85,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, ty) def test_nd(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -102,7 +100,7 @@ def vector_add(a, b): self.assertEqual(actual.dtype, dtype) def test_output_arg(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -117,7 +115,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def test_reduce(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -135,8 +133,7 @@ def vector_add(a, b): self.assertEqual(dtype, actual.dtype) def test_reduce_async(self): - - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -153,7 +150,7 @@ def vector_add(a, b): self.assertEqual(dtype, actual.dtype) def 
test_manual_transfer(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -166,7 +163,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def test_ufunc_output_2d(self): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -181,7 +178,7 @@ def vector_add(a, b): self.assertEqual(expected.dtype, actual.dtype) def check_tuple_arg(self, a, b): - @vectorize(signatures, target='cuda') + @vectorize(signatures, target="cuda") def vector_add(a, b): return a + b @@ -194,7 +191,7 @@ def test_tuple_arg(self): self.check_tuple_arg(a, b) def test_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) + Point = namedtuple("Point", ("x", "y", "z")) a = Point(x=1.0, y=2.0, z=3.0) b = Point(x=4.0, y=5.0, z=6.0) self.check_tuple_arg(a, b) @@ -206,7 +203,7 @@ def test_tuple_of_array_arg(self): self.check_tuple_arg(a, b) def test_tuple_of_namedtuple_arg(self): - Point = namedtuple('Point', ('x', 'y', 'z')) + Point = namedtuple("Point", ("x", "y", "z")) a = (Point(x=1.0, y=2.0, z=3.0), Point(x=1.5, y=2.5, z=3.5)) b = (Point(x=4.0, y=5.0, z=6.0), Point(x=4.5, y=5.5, z=6.5)) self.check_tuple_arg(a, b) @@ -216,17 +213,17 @@ def test_namedtuple_of_array_arg(self): ys1 = xs1 + 2 xs2 = np.arange(10, dtype=np.int32) * 2 ys2 = xs2 + 1 - Points = namedtuple('Points', ('xs', 'ys')) + Points = namedtuple("Points", ("xs", "ys")) a = Points(xs=xs1, ys=ys1) b = Points(xs=xs2, ys=ys2) self.check_tuple_arg(a, b) def test_name_attribute(self): - @vectorize('f8(f8)', target='cuda') + @vectorize("f8(f8)", target="cuda") def bar(x): - return x ** 2 + return x**2 - self.assertEqual(bar.__name__, 'bar') + self.assertEqual(bar.__name__, "bar") def test_no_transfer_for_device_data(self): # Initialize test data on the device prior to banning host <-> device @@ -238,15 +235,15 @@ def test_no_transfer_for_device_data(self): # A mock of a CUDA 
function that always raises a CudaAPIError def raising_transfer(*args, **kwargs): - raise CudaAPIError(999, 'Transfer not allowed') + raise CudaAPIError(999, "Transfer not allowed") # Use the mock for transfers between the host and device - old_HtoD = getattr(driver, 'cuMemcpyHtoD', None) - old_DtoH = getattr(driver, 'cuMemcpyDtoH', None) + old_HtoD = getattr(driver, "cuMemcpyHtoD", None) + old_DtoH = getattr(driver, "cuMemcpyDtoH", None) - setattr(driver, 'cuMemcpyHtoD', raising_transfer) - setattr(driver, 'cuMemcpyDtoH', raising_transfer) + setattr(driver, "cuMemcpyHtoD", raising_transfer) + setattr(driver, "cuMemcpyDtoH", raising_transfer) # Ensure that the mock functions are working as expected @@ -260,7 +257,7 @@ def raising_transfer(*args, **kwargs): # Check that defining and calling a ufunc with data on the device # induces no transfers - @vectorize(['float32(float32)'], target='cuda') + @vectorize(["float32(float32)"], target="cuda") def func(noise): return noise + 1.0 @@ -270,14 +267,14 @@ def func(noise): # no original implementation, simply remove ours. 
if old_HtoD is not None: - setattr(driver, 'cuMemcpyHtoD', old_HtoD) + setattr(driver, "cuMemcpyHtoD", old_HtoD) else: del driver.cuMemcpyHtoD if old_DtoH is not None: - setattr(driver, 'cuMemcpyDtoH', old_DtoH) + setattr(driver, "cuMemcpyDtoH", old_DtoH) else: del driver.cuMemcpyDtoH -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py index 82c7ca8f8..8da551309 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py @@ -4,17 +4,17 @@ import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeComplex(CUDATestCase): def test_vectorize_complex(self): - @vectorize(['complex128(complex128)'], target='cuda') + @vectorize(["complex128(complex128)"], target="cuda") def vcomp(a): - return a * a + 1. 
+ return a * a + 1.0 A = np.arange(5, dtype=np.complex128) B = vcomp(A) - self.assertTrue(np.allclose(A * A + 1., B)) + self.assertTrue(np.allclose(A * A + 1.0, B)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py index 12b8fa03c..1c2bd513d 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py @@ -1,21 +1,25 @@ import numpy as np from numba import vectorize, cuda -from numba.tests.npyufunc.test_vectorize_decor import BaseVectorizeDecor, \ - BaseVectorizeNopythonArg, BaseVectorizeUnrecognizedArg +from numba.tests.npyufunc.test_vectorize_decor import ( + BaseVectorizeDecor, + BaseVectorizeNopythonArg, + BaseVectorizeUnrecognizedArg, +) from numba.cuda.testing import skip_on_cudasim, CUDATestCase import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeDecor(CUDATestCase, BaseVectorizeDecor): """ Runs the tests from BaseVectorizeDecor with the CUDA target. 
""" - target = 'cuda' + target = "cuda" -@skip_on_cudasim('ufunc API unsupported in the simulator') + +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestGPUVectorizeBroadcast(CUDATestCase): def test_broadcast(self): a = np.random.randn(100, 3, 1) @@ -24,7 +28,7 @@ def test_broadcast(self): def fn(a, b): return a - b - @vectorize(['float64(float64,float64)'], target='cuda') + @vectorize(["float64(float64,float64)"], target="cuda") def fngpu(a, b): return a - b @@ -43,7 +47,7 @@ def test_device_broadcast(self): def fn(a, b): return a - b - @vectorize(['float64(float64,float64)'], target='cuda') + @vectorize(["float64(float64,float64)"], target="cuda") def fngpu(a, b): return a - b @@ -52,18 +56,18 @@ def fngpu(a, b): np.testing.assert_almost_equal(expect, got.copy_to_host()) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeNopythonArg(BaseVectorizeNopythonArg, CUDATestCase): def test_target_cuda_nopython(self): warnings = ["nopython kwarg for cuda target is redundant"] - self._test_target_nopython('cuda', warnings) + self._test_target_nopython("cuda", warnings) -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestVectorizeUnrecognizedArg(BaseVectorizeUnrecognizedArg, CUDATestCase): def test_target_cuda_unrecognized_arg(self): - self._test_target_unrecognized_arg('cuda') + self._test_target_unrecognized_arg("cuda") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py index e33598d8b..67e2d3265 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py @@ -5,19 +5,19 @@ import unittest -@skip_on_cudasim('ufunc API unsupported in the simulator') 
+@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCudaVectorizeDeviceCall(CUDATestCase): def test_cuda_vectorize_device_call(self): - @cuda.jit(float32(float32, float32, float32), device=True) def cu_device_fn(x, y, z): - return x ** y / z + return x**y / z def cu_ufunc(x, y, z): return cu_device_fn(x, y, z) - ufunc = vectorize([float32(float32, float32, float32)], target='cuda')( - cu_ufunc) + ufunc = vectorize([float32(float32, float32, float32)], target="cuda")( + cu_ufunc + ) N = 100 @@ -27,10 +27,10 @@ def cu_ufunc(x, y, z): out = ufunc(X, Y, Z) - gold = (X ** Y) / Z + gold = (X**Y) / Z self.assertTrue(np.allclose(out, gold)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py index 1c65a41d7..e413e67d1 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py @@ -7,11 +7,10 @@ sig = [float64(float64, float64)] -@skip_on_cudasim('ufunc API unsupported in the simulator') +@skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorizeScalarArg(CUDATestCase): - def test_vectorize_scalar_arg(self): - @vectorize(sig, target='cuda') + @vectorize(sig, target="cuda") def vector_add(a, b): return a + b @@ -20,11 +19,11 @@ def vector_add(a, b): v = vector_add(1.0, dA) np.testing.assert_array_almost_equal( - v.copy_to_host(), - np.arange(1, 11, dtype=np.float64)) + v.copy_to_host(), np.arange(1, 11, dtype=np.float64) + ) def test_vectorize_all_scalars(self): - @vectorize(sig, target='cuda') + @vectorize(sig, target="cuda") def vector_add(a, b): return a + b @@ -33,5 +32,5 @@ def vector_add(a, b): np.testing.assert_almost_equal(2.0, v) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_warning.py 
b/numba_cuda/numba/cuda/tests/cudapy/test_warning.py index fbcb643fe..11fd61b55 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_warning.py @@ -7,27 +7,27 @@ import warnings -@skip_on_cudasim('cudasim does not raise performance warnings') +@skip_on_cudasim("cudasim does not raise performance warnings") class TestWarnings(CUDATestCase): def test_inefficient_launch_configuration(self): @cuda.jit def kernel(): pass - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: kernel[1, 1]() self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Grid size', str(w[0].message)) - self.assertIn('low occupancy', str(w[0].message)) + self.assertIn("Grid size", str(w[0].message)) + self.assertIn("low occupancy", str(w[0].message)) def test_efficient_launch_configuration(self): @cuda.jit def kernel(): pass - with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1): + with override_config("CUDA_LOW_OCCUPANCY_WARNINGS", 1): with warnings.catch_warnings(record=True) as w: kernel[256, 256]() @@ -40,14 +40,15 @@ def foo(r, x): N = 10 arr_f32 = np.zeros(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](arr_f32, N) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Host array used in CUDA kernel will incur', - str(w[0].message)) - self.assertIn('copy overhead', str(w[0].message)) + self.assertIn( + "Host array used in CUDA kernel will incur", str(w[0].message) + ) + self.assertIn("copy overhead", str(w[0].message)) def test_pinned_warn_on_host_array(self): @cuda.jit @@ -57,14 +58,15 @@ def foo(r, x): N = 10 ary = cuda.pinned_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with 
override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) self.assertEqual(w[0].category, NumbaPerformanceWarning) - self.assertIn('Host array used in CUDA kernel will incur', - str(w[0].message)) - self.assertIn('copy overhead', str(w[0].message)) + self.assertIn( + "Host array used in CUDA kernel will incur", str(w[0].message) + ) + self.assertIn("copy overhead", str(w[0].message)) def test_nowarn_on_mapped_array(self): @cuda.jit @@ -74,7 +76,7 @@ def foo(r, x): N = 10 ary = cuda.mapped_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -89,7 +91,7 @@ def foo(r, x): N = 10 ary = cuda.managed_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -103,7 +105,7 @@ def foo(r, x): N = 10 ary = cuda.device_array(N, dtype=np.float32) - with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1): + with override_config("CUDA_WARN_ON_IMPLICIT_COPY", 1): with warnings.catch_warnings(record=True) as w: foo[1, N](ary, N) @@ -114,14 +116,14 @@ def test_warn_on_debug_and_opt(self): cuda.jit(debug=True, opt=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) def test_warn_on_debug_and_opt_default(self): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) def test_no_warn_on_debug_and_no_opt(self): with warnings.catch_warnings(record=True) as w: @@ -136,8 +138,8 @@ def test_no_warn_with_no_debug_and_opt_kwargs(self): self.assertEqual(len(w), 0) def 
test_no_warn_on_debug_and_opt_with_config(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): - with override_config('OPT', config._OptLevel(0)): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): + with override_config("OPT", config._OptLevel(0)): with warnings.catch_warnings(record=True) as w: cuda.jit() @@ -148,30 +150,30 @@ def test_no_warn_on_debug_and_opt_with_config(self): self.assertEqual(len(w), 0) - with override_config('OPT', config._OptLevel(0)): + with override_config("OPT", config._OptLevel(0)): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 0) def test_warn_on_debug_and_opt_with_config(self): - with override_config('CUDA_DEBUGINFO_DEFAULT', 1): - for opt in (1, 2, 3, 'max'): - with override_config('OPT', config._OptLevel(opt)): + with override_config("CUDA_DEBUGINFO_DEFAULT", 1): + for opt in (1, 2, 3, "max"): + with override_config("OPT", config._OptLevel(opt)): with warnings.catch_warnings(record=True) as w: cuda.jit() self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) - for opt in (1, 2, 3, 'max'): - with override_config('OPT', config._OptLevel(opt)): + for opt in (1, 2, 3, "max"): + with override_config("OPT", config._OptLevel(opt)): with warnings.catch_warnings(record=True) as w: cuda.jit(debug=True) self.assertEqual(len(w), 1) - self.assertIn('not supported by CUDA', str(w[0].message)) + self.assertIn("not supported by CUDA", str(w[0].message)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py index 2fc157d07..6f3d0f26e 100644 --- a/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +++ b/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py @@ -8,73 +8,73 @@ def useful_syncwarp(ary): i = cuda.grid(1) if i == 0: ary[0] = 42 - cuda.syncwarp(0xffffffff) + 
cuda.syncwarp(0xFFFFFFFF) ary[i] = ary[0] def use_shfl_sync_idx(ary, idx): i = cuda.grid(1) - val = cuda.shfl_sync(0xffffffff, i, idx) + val = cuda.shfl_sync(0xFFFFFFFF, i, idx) ary[i] = val def use_shfl_sync_up(ary, delta): i = cuda.grid(1) - val = cuda.shfl_up_sync(0xffffffff, i, delta) + val = cuda.shfl_up_sync(0xFFFFFFFF, i, delta) ary[i] = val def use_shfl_sync_down(ary, delta): i = cuda.grid(1) - val = cuda.shfl_down_sync(0xffffffff, i, delta) + val = cuda.shfl_down_sync(0xFFFFFFFF, i, delta) ary[i] = val def use_shfl_sync_xor(ary, xor): i = cuda.grid(1) - val = cuda.shfl_xor_sync(0xffffffff, i, xor) + val = cuda.shfl_xor_sync(0xFFFFFFFF, i, xor) ary[i] = val def use_shfl_sync_with_val(ary, into): i = cuda.grid(1) - val = cuda.shfl_sync(0xffffffff, into, 0) + val = cuda.shfl_sync(0xFFFFFFFF, into, 0) ary[i] = val def use_vote_sync_all(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.all_sync(0xffffffff, ary_in[i]) + pred = cuda.all_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_any(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.any_sync(0xffffffff, ary_in[i]) + pred = cuda.any_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_eq(ary_in, ary_out): i = cuda.grid(1) - pred = cuda.eq_sync(0xffffffff, ary_in[i]) + pred = cuda.eq_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = pred def use_vote_sync_ballot(ary): i = cuda.threadIdx.x - ballot = cuda.ballot_sync(0xffffffff, True) + ballot = cuda.ballot_sync(0xFFFFFFFF, True) ary[i] = ballot def use_match_any_sync(ary_in, ary_out): i = cuda.grid(1) - ballot = cuda.match_any_sync(0xffffffff, ary_in[i]) + ballot = cuda.match_any_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = ballot def use_match_all_sync(ary_in, ary_out): i = cuda.grid(1) - ballot, pred = cuda.match_all_sync(0xffffffff, ary_in[i]) + ballot, pred = cuda.match_all_sync(0xFFFFFFFF, ary_in[i]) ary_out[i] = ballot if pred else 0 @@ -146,8 +146,12 @@ def test_shfl_sync_xor(self): def test_shfl_sync_types(self): types = int32, int64, 
float32, float64 - values = (np.int32(-1), np.int64(1 << 42), - np.float32(np.pi), np.float64(np.pi)) + values = ( + np.int32(-1), + np.int64(1 << 42), + np.float32(np.pi), + np.float64(np.pi), + ) for typ, val in zip(types, values): compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val) nelem = 32 @@ -197,10 +201,11 @@ def test_vote_sync_ballot(self): nelem = 32 ary = np.empty(nelem, dtype=np.uint32) compiled[1, nelem](ary) - self.assertTrue(np.all(ary == np.uint32(0xffffffff))) + self.assertTrue(np.all(ary == np.uint32(0xFFFFFFFF))) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Matching requires at least Volta Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture" + ) def test_match_any_sync(self): compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync) nelem = 10 @@ -210,8 +215,9 @@ def test_match_any_sync(self): compiled[1, nelem](ary_in, ary_out) self.assertTrue(np.all(ary_out == exp)) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Matching requires at least Volta Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), "Matching requires at least Volta Architecture" + ) def test_match_all_sync(self): compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync) nelem = 10 @@ -223,9 +229,10 @@ def test_match_all_sync(self): compiled[1, nelem](ary_in, ary_out) self.assertTrue(np.all(ary_out == 0)) - @unittest.skipUnless(_safe_cc_check((7, 0)), - "Independent scheduling requires at least Volta " - "Architecture") + @unittest.skipUnless( + _safe_cc_check((7, 0)), + "Independent scheduling requires at least Volta Architecture", + ) def test_independent_scheduling(self): compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling) arr = np.empty(32, dtype=np.uint32) @@ -267,10 +274,9 @@ def use_lanemask_lt(x): # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc. # or in binary: # ...0001, ....0011, ...0111, etc. 
- expected = np.asarray([(2 ** i) - 1 for i in range(32)], - dtype=np.uint32) + expected = np.asarray([(2**i) - 1 for i in range(32)], dtype=np.uint32) np.testing.assert_equal(expected, out) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py b/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py index 0f544821a..c1e8d0b23 100644 --- a/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +++ b/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py @@ -10,12 +10,16 @@ class TestCudaSimIssues(CUDATestCase): def test_record_access(self): - backyard_type = [('statue', np.float64), - ('newspaper', np.float64, (6,))] + backyard_type = [ + ("statue", np.float64), + ("newspaper", np.float64, (6,)), + ] - goose_type = [('garden', np.float64, (12,)), - ('town', np.float64, (42,)), - ('backyard', backyard_type)] + goose_type = [ + ("garden", np.float64, (12,)), + ("town", np.float64, (42,)), + ("backyard", backyard_type), + ] goose_np_type = np.dtype(goose_type, align=True) @@ -27,20 +31,22 @@ def simple_kernel(f): item = np.recarray(1, dtype=goose_np_type) simple_kernel[1, 1](item[0]) - np.testing.assert_equal(item[0]['garden'][0], 45) - np.testing.assert_equal(item[0]['backyard']['newspaper'][3], 5) + np.testing.assert_equal(item[0]["garden"][0], 45) + np.testing.assert_equal(item[0]["backyard"]["newspaper"][3], 5) def test_recarray_setting(self): - recordwith2darray = np.dtype([('i', np.int32), - ('j', np.float32, (3, 2))]) + recordwith2darray = np.dtype( + [("i", np.int32), ("j", np.float32, (3, 2))] + ) rec = np.recarray(2, dtype=recordwith2darray) - rec[0]['i'] = 45 + rec[0]["i"] = 45 @cuda.jit def simple_kernel(f): f[1] = f[0] + simple_kernel[1, 1](rec) - np.testing.assert_equal(rec[0]['i'], rec[1]['i']) + np.testing.assert_equal(rec[0]["i"], rec[1]["i"]) def test_cuda_module_in_device_function(self): """ @@ -63,7 +69,7 @@ def outer(out): expected = 
np.arange(arr.size, dtype=np.int32) np.testing.assert_equal(expected, arr) - @skip_unless_cudasim('Only works on CUDASIM') + @skip_unless_cudasim("Only works on CUDASIM") def test_deadlock_on_exception(self): def assert_no_blockthreads(): blockthreads = [] @@ -98,5 +104,5 @@ def assign_with_sync(x, y): assert_no_blockthreads() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/data/jitlink.cu b/numba_cuda/numba/cuda/tests/data/jitlink.cu index 4d245366c..a2737a6ef 100644 --- a/numba_cuda/numba/cuda/tests/data/jitlink.cu +++ b/numba_cuda/numba/cuda/tests/data/jitlink.cu @@ -20,4 +20,4 @@ int array_mutator(void *out, int *a) { a[0] = a[1]; return 0; -} +} diff --git a/numba_cuda/numba/cuda/tests/data/jitlink.ptx b/numba_cuda/numba/cuda/tests/data/jitlink.ptx index dde0cc214..fdbbb261f 100644 --- a/numba_cuda/numba/cuda/tests/data/jitlink.ptx +++ b/numba_cuda/numba/cuda/tests/data/jitlink.ptx @@ -47,5 +47,3 @@ st.param.b32 [func_retval0+0], %r2; ret; } - - diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py b/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py index fc8405dbb..a14f4eac5 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py @@ -2,14 +2,18 @@ # "magictoken" is used for markers as beginning and ending of example text. 
import unittest -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim, - skip_if_cudadevrt_missing, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_on_cudasim, + skip_if_cudadevrt_missing, + skip_unless_cc_60, + skip_if_mvc_enabled, +) @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestCooperativeGroups(CUDATestCase): def test_ex_grid_sync(self): @@ -17,7 +21,7 @@ def test_ex_grid_sync(self): from numba import cuda, int32 import numpy as np - sig = (int32[:,::1],) + sig = (int32[:, ::1],) @cuda.jit(sig) def sequential_rows(M): @@ -34,6 +38,7 @@ def sequential_rows(M): # Wait until all threads have written their column element, # and that the write is visible to all other threads g.sync() + # magictoken.ex_grid_sync_kernel.end # magictoken.ex_grid_sync_data.begin @@ -48,9 +53,11 @@ def sequential_rows(M): # Skip this test if the grid size used in the example is too large for # a cooperative launch on the current GPU - mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim) + mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks( + blockdim + ) if mb < griddim: - self.skipTest('Device does not support a large enough coop grid') + self.skipTest("Device does not support a large enough coop grid") # magictoken.ex_grid_sync_launch.begin # Kernel launch - this is implicitly a cooperative launch @@ -73,5 +80,5 @@ def sequential_rows(M): np.testing.assert_equal(A, reference) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py b/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py index b879a12d2..f8ec6f51f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +++ 
b/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py @@ -41,6 +41,7 @@ def test_ex_cpu_gpu_compat(self): @numba.jit def business_logic(x, y, z): return 4 * z * (2 * x - (4 * y) / 2 * pi) + # ex_cpu_gpu_compat.define.end # ex_cpu_gpu_compat.cpurun.begin @@ -54,6 +55,7 @@ def f(res, xarr, yarr, zarr): if tid < len(xarr): # The function decorated with numba.jit may be directly reused res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid]) + # ex_cpu_gpu_compat.usegpu.end # ex_cpu_gpu_compat.launch.begin @@ -62,14 +64,9 @@ def f(res, xarr, yarr, zarr): # [-126.79644737231007, 416.28324559588634, -218912930.2987788] # ex_cpu_gpu_compat.launch.end - expect = [ - business_logic(x, y, z) for x, y, z in zip(X, Y, Z) - ] + expect = [business_logic(x, y, z) for x, y, z in zip(X, Y, Z)] - np.testing.assert_equal( - expect, - results.copy_to_host() - ) + np.testing.assert_equal(expect, results.copy_to_host()) if __name__ == "__main__": diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py b/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py index 48a59bddf..39e439f3f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py @@ -2,7 +2,7 @@ # "magictoken" is used for markers as beginning and ending of example text. 
import unittest -from numba.cuda.testing import (CUDATestCase, skip_on_cudasim) +from numba.cuda.testing import CUDATestCase, skip_on_cudasim from numba.tests.support import skip_unless_cffi @@ -18,11 +18,12 @@ def test_ex_linking_cu(self): # Path to the source containing the foreign function # (here assumed to be in a subdirectory called "ffi") basedir = os.path.dirname(os.path.abspath(__file__)) - functions_cu = os.path.join(basedir, 'ffi', 'functions.cu') + functions_cu = os.path.join(basedir, "ffi", "functions.cu") # Declaration of the foreign function - mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)', - link=functions_cu) + mul = cuda.declare_device( + "mul_f32_f32", "float32(float32, float32)", link=functions_cu + ) # A kernel that calls mul; functions.cu is linked automatically due to # the call to mul. @@ -52,25 +53,29 @@ def test_ex_from_buffer(self): import os basedir = os.path.dirname(os.path.abspath(__file__)) - functions_cu = os.path.join(basedir, 'ffi', 'functions.cu') + functions_cu = os.path.join(basedir, "ffi", "functions.cu") # magictoken.ex_from_buffer_decl.begin - signature = 'float32(CPointer(float32), int32)' - sum_reduce = cuda.declare_device('sum_reduce', signature, - link=functions_cu) + signature = "float32(CPointer(float32), int32)" + sum_reduce = cuda.declare_device( + "sum_reduce", signature, link=functions_cu + ) # magictoken.ex_from_buffer_decl.end # magictoken.ex_from_buffer_kernel.begin import cffi + ffi = cffi.FFI() @cuda.jit def reduction_caller(result, array): array_ptr = ffi.from_buffer(array) result[()] = sum_reduce(array_ptr, len(array)) + # magictoken.ex_from_buffer_kernel.end import numpy as np + x = np.arange(10).astype(np.float32) r = np.ndarray((), dtype=np.float32) @@ -81,5 +86,5 @@ def reduction_caller(result, array): np.testing.assert_allclose(expected, actual) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py b/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py index 4caea9286..75f38446a 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py @@ -1,14 +1,18 @@ import unittest -from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing, - skip_on_cudasim, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_if_cudadevrt_missing, + skip_on_cudasim, + skip_unless_cc_60, + skip_if_mvc_enabled, +) from numba.tests.support import captured_stdout @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestLaplace(CUDATestCase): """ @@ -27,7 +31,6 @@ def tearDown(self): super().tearDown() def test_ex_laplace(self): - # set True to regenerate the figures that # accompany this example plot = False @@ -55,24 +58,25 @@ def test_ex_laplace(self): if plot: import matplotlib.pyplot as plt + fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66)) plt.plot( np.arange(len(buf_0)), buf_0.copy_to_host(), lw=3, marker="*", - color='black' + color="black", ) - plt.title('Initial State', fontsize=24) - plt.xlabel('Position', fontsize=24) - plt.ylabel('Temperature', fontsize=24) + plt.title("Initial State", fontsize=24) + plt.xlabel("Position", fontsize=24) + plt.ylabel("Temperature", fontsize=24) ax.set_xticks(ax.get_xticks(), fontsize=16) ax.set_yticks(ax.get_yticks(), fontsize=16) plt.xlim(0, len(data)) plt.ylim(0, 10001) - plt.savefig('laplace_initial.svg') + plt.savefig("laplace_initial.svg") # ex_laplace.kernel.begin @cuda.jit @@ -116,12 +120,11 @@ def solve_heat_equation(buf_0, buf_1, timesteps, k): # Wait for every thread to write before moving on grid.sync() + # ex_laplace.kernel.end # ex_laplace.launch.begin - 
solve_heat_equation.forall(len(data))( - buf_0, buf_1, niter, 0.25 - ) + solve_heat_equation.forall(len(data))(buf_0, buf_1, niter, 0.25) # ex_laplace.launch.end results = buf_1.copy_to_host() @@ -129,20 +132,21 @@ def solve_heat_equation(buf_0, buf_1, timesteps, k): fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66)) plt.plot( np.arange(len(results)), - results, lw=3, + results, + lw=3, marker="*", - color='black' + color="black", ) plt.title(f"T = {niter}", fontsize=24) - plt.xlabel('Position', fontsize=24) - plt.ylabel('Temperature', fontsize=24) + plt.xlabel("Position", fontsize=24) + plt.ylabel("Temperature", fontsize=24) ax.set_xticks(ax.get_xticks(), fontsize=16) ax.set_yticks(ax.get_yticks(), fontsize=16) plt.ylim(0, max(results)) plt.xlim(0, len(results)) - plt.savefig('laplace_final.svg') + plt.savefig("laplace_final.svg") # Integral over the domain should be equal to its initial value. # Note that this should match the initial value of data[500] above, but diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py b/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py index 6e0dd44c1..9633954f0 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py @@ -6,6 +6,7 @@ Contents in this file are referenced from the sphinx-generated docs. "magictoken" is used for markers as beginning and ending of example text. """ + import unittest from numba.cuda.testing import CUDATestCase, skip_on_cudasim from numba.tests.support import captured_stdout @@ -43,10 +44,11 @@ def matmul(A, B, C): """Perform square matrix multiplication of C = A * B.""" i, j = cuda.grid(2) if i < C.shape[0] and j < C.shape[1]: - tmp = 0. 
+ tmp = 0.0 for k in range(A.shape[1]): tmp += A[i, k] * B[k, j] C[i, j] = tmp + # magictoken.ex_matmul.end # magictoken.ex_run_matmul.begin @@ -91,11 +93,11 @@ def fast_matmul(A, B, C): tx = cuda.threadIdx.x ty = cuda.threadIdx.y - bpg = cuda.gridDim.x # blocks per grid + bpg = cuda.gridDim.x # blocks per grid # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. - tmp = float32(0.) + tmp = float32(0.0) for i in range(bpg): # Preload data into shared memory sA[ty, tx] = 0 @@ -116,6 +118,7 @@ def fast_matmul(A, B, C): cuda.syncthreads() if y < C.shape[0] and x < C.shape[1]: C[y, x] = tmp + # magictoken.ex_fast_matmul.end # magictoken.ex_run_fast_matmul.begin @@ -169,5 +172,5 @@ def fast_matmul(A, B, C): self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py b/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py index 92627084f..8a5d9f46f 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py @@ -59,6 +59,7 @@ def mc_integrator_kernel(out, rng_states, lower_lim, upper_lim): # value of the sample y = func(samp) out[gid] = y + # ex_montecarlo.kernel.end # ex_montecarlo.callfunc.begin @@ -84,6 +85,7 @@ def mc_integrate(lower_lim, upper_lim, nsamps): factor = (upper_lim - lower_lim) / (nsamps - 1) return sum_reduce(out) * factor + # ex_montecarlo.callfunc.end # ex_montecarlo.launch.begin diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py index 0e93a1f17..3ef8ec3a9 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_random.py @@ -10,8 +10,10 @@ class TestRandom(CUDATestCase): def test_ex_3d_grid(self): # magictoken.ex_3d_grid.begin 
from numba import cuda - from numba.cuda.random import (create_xoroshiro128p_states, - xoroshiro128p_uniform_float32) + from numba.cuda.random import ( + create_xoroshiro128p_states, + xoroshiro128p_uniform_float32, + ) import numpy as np @cuda.jit @@ -27,7 +29,9 @@ def random_3d(arr, rng_states): for i in range(startz, arr.shape[0], stridez): for j in range(starty, arr.shape[1], stridey): for k in range(startx, arr.shape[2], stridex): - arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid) + arr[i, j, k] = xoroshiro128p_uniform_float32( + rng_states, tid + ) # Array dimensions X, Y, Z = 701, 900, 719 @@ -55,5 +59,5 @@ def random_3d(arr, rng_states): self.assertTrue(np.all(host_arr >= 0.0)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py b/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py index c118fbf15..92a0e6ade 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py @@ -61,11 +61,12 @@ def array_sum(data): # After the loop, the zeroth element contains the sum if tid == 0: data[tid] = shr[tid] + # ex_reduction.kernel.end # ex_reduction.launch.begin array_sum[1, nelem](a) - print(a[0]) # 523776 + print(a[0]) # 523776 print(sum(np.arange(1024))) # 523776 # ex_reduction.launch.end diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py b/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py index 6c66a6599..c3a23471a 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py @@ -1,14 +1,18 @@ import unittest -from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing, - skip_on_cudasim, skip_unless_cc_60, - skip_if_mvc_enabled) +from numba.cuda.testing import ( + CUDATestCase, + skip_if_cudadevrt_missing, + skip_on_cudasim, + skip_unless_cc_60, + 
skip_if_mvc_enabled, +) from numba.tests.support import captured_stdout @skip_if_cudadevrt_missing @skip_unless_cc_60 -@skip_if_mvc_enabled('CG not supported with MVC') +@skip_if_mvc_enabled("CG not supported with MVC") @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level") class TestSessionization(CUDATestCase): """ @@ -40,26 +44,71 @@ def test_ex_sessionize(self): ids = cuda.to_device( np.array( [ - 1, 1, 1, 1, 1, 1, - 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, ] ) ) sec = cuda.to_device( np.array( [ - 1, 2, 3, 5000, 5001, 5002, 1, - 2, 3, 1, 2, 5000, 5001, 10000, - 10001, 10002, 10003, 15000, 150001, - 1, 5000, 50001, 15000, 20000, - 25000, 25001, 25002, 25003, + 1, + 2, + 3, + 5000, + 5001, + 5002, + 1, + 2, + 3, + 1, + 2, + 5000, + 5001, + 10000, + 10001, + 10002, + 10003, + 15000, + 150001, + 1, + 5000, + 50001, + 15000, + 20000, + 25000, + 25001, + 25002, + 25003, ], dtype="datetime64[ns]", - ).astype( - "int64" - ) # Cast to int64 for compatibility + ).astype("int64") # Cast to int64 for compatibility ) # Create a vector to hold the results results = cuda.to_device(np.zeros(len(ids))) @@ -105,6 +154,7 @@ def sessionize(user_id, timestamp, results): if gid + look_ahead == size - 1: results[gid + look_ahead] = gid break + # ex_sessionize.kernel.end # ex_sessionize.launch.begin @@ -119,9 +169,34 @@ def sessionize(user_id, timestamp, results): # ex_sessionize.launch.end expect = [ - 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, - 11, 11, 13, 13, 13, 13, 17, 18, 19, 20, 21, - 21, 23, 24, 24, 24, 24 + 0, + 0, + 0, + 3, + 3, + 3, + 6, + 6, + 6, + 9, + 9, + 11, + 11, + 13, + 13, + 13, + 13, + 17, + 18, + 19, + 20, + 21, + 21, + 23, + 24, + 24, + 24, + 24, ] np.testing.assert_equal(expect, results.copy_to_host()) diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py 
b/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py index c6ae197ee..64131f0a7 100644 --- a/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +++ b/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py @@ -37,6 +37,7 @@ def f(a, b, c): if tid < size: c[tid] = a[tid] + b[tid] + # ex_vecadd.kernel.end # Seed RNG for test repeatability @@ -64,8 +65,7 @@ def f(a, b, c): # ex_vecadd.launch.end np.testing.assert_equal( - c.copy_to_host(), - a.copy_to_host() + b.copy_to_host() + c.copy_to_host(), a.copy_to_host() + b.copy_to_host() ) diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py index e4ad7d0fd..a870d1e38 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py @@ -7,9 +7,8 @@ @skip_on_cudasim("Tests internals of the CUDA driver device array") class TestSlicing(unittest.TestCase): - def assertSameContig(self, arr, nparr): - attrs = 'C_CONTIGUOUS', 'F_CONTIGUOUS' + attrs = "C_CONTIGUOUS", "F_CONTIGUOUS" for attr in attrs: if arr.flags[attr] != nparr.flags[attr]: if arr.size == 0 and nparr.size == 0: @@ -17,15 +16,18 @@ def assertSameContig(self, arr, nparr): # some are not pass else: - self.fail("contiguous flag mismatch:\ngot=%s\nexpect=%s" % - (arr.flags, nparr.flags)) + self.fail( + "contiguous flag mismatch:\ngot=%s\nexpect=%s" + % (arr.flags, nparr.flags) + ) #### 1D def test_slice0_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) self.assertSameContig(arr, nparr) xx = -2, -1, 0, 1, 2 for x in xx: @@ -37,8 +39,9 @@ def test_slice0_1d(self): def test_slice1_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 0, 1, 2 for 
x in xx: expect = nparr[:x] @@ -49,8 +52,9 @@ def test_slice1_1d(self): def test_slice2_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 0, 1, 2 for x, y in itertools.product(xx, xx): expect = nparr[x:y] @@ -63,8 +67,9 @@ def test_slice2_1d(self): def test_slice0_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 1, 2 for x in xx: expect = nparr[x:] @@ -82,8 +87,9 @@ def test_slice0_2d(self): def test_slice1_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 2 for x in xx: expect = nparr[:x] @@ -101,8 +107,9 @@ def test_slice1_2d(self): def test_slice2_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, 0, 2 for s, t, u, v in itertools.product(xx, xx, xx, xx): expect = nparr[s:t, u:v] @@ -122,8 +129,9 @@ def test_slice2_2d(self): def test_strided_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for x in xx: expect = nparr[::x] @@ -134,8 +142,9 @@ def test_strided_1d(self): def test_strided_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for a, b in itertools.product(xx, xx): expect = nparr[::a, ::b] @@ -146,8 +155,9 @@ def 
test_strided_2d(self): def test_strided_3d(self): nparr = np.empty((4, 5, 6)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) xx = -2, -1, 1, 2 for a, b, c in itertools.product(xx, xx, xx): expect = nparr[::a, ::b, ::c] @@ -160,16 +170,17 @@ def test_issue_2766(self): z = np.empty((1, 2, 3)) z = np.transpose(z, axes=(2, 0, 1)) arr = Array.from_desc(0, z.shape, z.strides, z.itemsize) - self.assertEqual(z.flags['C_CONTIGUOUS'], arr.flags['C_CONTIGUOUS']) - self.assertEqual(z.flags['F_CONTIGUOUS'], arr.flags['F_CONTIGUOUS']) + self.assertEqual(z.flags["C_CONTIGUOUS"], arr.flags["C_CONTIGUOUS"]) + self.assertEqual(z.flags["F_CONTIGUOUS"], arr.flags["F_CONTIGUOUS"]) @skip_on_cudasim("Tests internals of the CUDA driver device array") class TestReshape(unittest.TestCase): def test_reshape_2d2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, 4) got = arr.reshape(5, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -177,8 +188,9 @@ def test_reshape_2d2d(self): def test_reshape_2d1d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5 * 4) got = arr.reshape(5 * 4)[0] self.assertEqual(got.shape, expect.shape) @@ -186,8 +198,9 @@ def test_reshape_2d1d(self): def test_reshape_3d3d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, 3, 4) got = arr.reshape(5, 3, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -195,8 +208,9 @@ def test_reshape_3d3d(self): def 
test_reshape_3d2d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3 * 4, 5) got = arr.reshape(3 * 4, 5)[0] self.assertEqual(got.shape, expect.shape) @@ -204,8 +218,9 @@ def test_reshape_3d2d(self): def test_reshape_3d1d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3 * 4 * 5) got = arr.reshape(3 * 4 * 5)[0] self.assertEqual(got.shape, expect.shape) @@ -213,8 +228,9 @@ def test_reshape_3d1d(self): def test_reshape_infer2d2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1, 4) got = arr.reshape(-1, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -222,8 +238,9 @@ def test_reshape_infer2d2d(self): def test_reshape_infer2d1d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1) got = arr.reshape(-1)[0] self.assertEqual(got.shape, expect.shape) @@ -231,8 +248,9 @@ def test_reshape_infer2d1d(self): def test_reshape_infer3d3d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(5, -1, 4) got = arr.reshape(5, -1, 4)[0] self.assertEqual(got.shape, expect.shape) @@ -240,8 +258,9 @@ def test_reshape_infer3d3d(self): def test_reshape_infer3d2d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - 
nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(3, -1) got = arr.reshape(3, -1)[0] self.assertEqual(got.shape, expect.shape) @@ -249,8 +268,9 @@ def test_reshape_infer3d2d(self): def test_reshape_infer3d1d(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) expect = nparr.reshape(-1) got = arr.reshape(-1)[0] self.assertEqual(got.shape, expect.shape) @@ -258,23 +278,26 @@ def test_reshape_infer3d1d(self): def test_reshape_infer_two_unknowns(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) with self.assertRaises(ValueError) as raises: arr.reshape(-1, -1, 3) - self.assertIn('can only specify one unknown dimension', - str(raises.exception)) + self.assertIn( + "can only specify one unknown dimension", str(raises.exception) + ) def test_reshape_infer_invalid_shape(self): nparr = np.empty((3, 4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) with self.assertRaises(ValueError) as raises: arr.reshape(-1, 7) - expected_message = 'cannot infer valid shape for unknown dimension' + expected_message = "cannot infer valid shape for unknown dimension" self.assertIn(expected_message, str(raises.exception)) @@ -289,6 +312,7 @@ def test_squeeze(self): def _assert_equal_shape_strides(arr1, arr2): self.assertEqual(arr1.shape, arr2.shape) self.assertEqual(arr1.strides, arr2.strides) + _assert_equal_shape_strides(arr, nparr) _assert_equal_shape_strides(arr.squeeze()[0], nparr.squeeze()) for axis in (0, 2, 4, (0, 2), (0, 4), (2, 4), (0, 2, 4)): @@ -311,29 +335,33 @@ def test_squeeze_invalid_axis(self): 
class TestExtent(unittest.TestCase): def test_extent_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) s, e = arr.extent self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) def test_extent_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) s, e = arr.extent self.assertEqual(e - s, nparr.size * nparr.dtype.itemsize) def test_extent_iter_1d(self): nparr = np.empty(4) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) [ext] = list(arr.iter_contiguous_extent()) self.assertEqual(ext, arr.extent) def test_extent_iter_2d(self): nparr = np.empty((4, 5)) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) [ext] = list(arr.iter_contiguous_extent()) self.assertEqual(ext, arr.extent) @@ -346,8 +374,9 @@ def test_for_loop(self): # for #4201 N = 5 nparr = np.empty(N) - arr = Array.from_desc(0, nparr.shape, nparr.strides, - nparr.dtype.itemsize) + arr = Array.from_desc( + 0, nparr.shape, nparr.strides, nparr.dtype.itemsize + ) x = 0 # just a placeholder # this loop should not raise AssertionError @@ -355,5 +384,5 @@ def test_for_loop(self): x = val # noqa: F841 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py b/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py index 1153707bb..ec59f3fab 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py @@ -9,19 +9,28 @@ class 
TestFunctionResolution(unittest.TestCase): def test_fp16_binary_operators(self): from numba.cuda.descriptor import cuda_target - ops = (operator.add, operator.iadd, operator.sub, operator.isub, - operator.mul, operator.imul) + + ops = ( + operator.add, + operator.iadd, + operator.sub, + operator.isub, + operator.mul, + operator.imul, + ) for op in ops: fp16 = types.float16 typingctx = cuda_target.typing_context typingctx.refresh() fnty = typingctx.resolve_value_type(op) out = typingctx.resolve_function_type(fnty, (fp16, fp16), {}) - self.assertEqual(out, typing.signature(fp16, fp16, fp16), - msg=str(out)) + self.assertEqual( + out, typing.signature(fp16, fp16, fp16), msg=str(out) + ) def test_fp16_unary_operators(self): from numba.cuda.descriptor import cuda_target + ops = (operator.neg, abs) for op in ops: fp16 = types.float16 @@ -32,5 +41,5 @@ def test_fp16_unary_operators(self): self.assertEqual(out, typing.signature(fp16, fp16), msg=str(out)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_import.py b/numba_cuda/numba/cuda/tests/nocuda/test_import.py index 73126cd6e..b44ccbc95 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_import.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_import.py @@ -11,30 +11,30 @@ def test_no_impl_import(self): """ banlist = ( - 'numba.cpython.slicing', - 'numba.cpython.tupleobj', - 'numba.cpython.enumimpl', - 'numba.cpython.hashing', - 'numba.cpython.heapq', - 'numba.cpython.iterators', - 'numba.cpython.numbers', - 'numba.cpython.rangeobj', - 'numba.cpython.cmathimpl', - 'numba.cpython.mathimpl', - 'numba.cpython.printimpl', - 'numba.cpython.randomimpl', - 'numba.core.optional', - 'numba.misc.gdb_hook', - 'numba.misc.literal', - 'numba.misc.cffiimpl', - 'numba.np.linalg', - 'numba.np.polynomial', - 'numba.np.arraymath', - 'numba.np.npdatetime', - 'numba.np.npyimpl', - 'numba.typed.typeddict', - 'numba.typed.typedlist', - 
'numba.experimental.jitclass.base', + "numba.cpython.slicing", + "numba.cpython.tupleobj", + "numba.cpython.enumimpl", + "numba.cpython.hashing", + "numba.cpython.heapq", + "numba.cpython.iterators", + "numba.cpython.numbers", + "numba.cpython.rangeobj", + "numba.cpython.cmathimpl", + "numba.cpython.mathimpl", + "numba.cpython.printimpl", + "numba.cpython.randomimpl", + "numba.core.optional", + "numba.misc.gdb_hook", + "numba.misc.literal", + "numba.misc.cffiimpl", + "numba.np.linalg", + "numba.np.polynomial", + "numba.np.arraymath", + "numba.np.npdatetime", + "numba.np.npyimpl", + "numba.typed.typeddict", + "numba.typed.typedlist", + "numba.experimental.jitclass.base", ) code = "import sys; from numba import cuda; print(list(sys.modules))" @@ -45,5 +45,5 @@ def test_no_impl_import(self): self.assertFalse(unexpected, "some modules unexpectedly imported") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py b/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py index acf670829..bca9eb680 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py @@ -21,12 +21,12 @@ has_cuda = nvvm.is_available() -has_mp_get_context = hasattr(mp, 'get_context') +has_mp_get_context = hasattr(mp, "get_context") class LibraryLookupBase(SerialMixin, unittest.TestCase): def setUp(self): - ctx = mp.get_context('spawn') + ctx = mp.get_context("spawn") qrecv = ctx.Queue() qsend = ctx.Queue() @@ -84,108 +84,108 @@ def check_lib_lookup(qout, qin): status = False -@skip_on_cudasim('Library detection unsupported in the simulator') -@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") 
+@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestLibDeviceLookUp(LibraryLookupBase): def test_libdevice_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) # Check that CUDA_HOME works by removing conda-env by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'libdevice')) + self.assertEqual(by, "CUDA_HOME") + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "libdevice")) self.assertFalse(warns) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, '') + self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_libdevice_path_decision() @staticmethod def do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_libdevice_path_decision() -@skip_on_cudasim('Library detection unsupported in the simulator') -@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") 
+@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestNvvmLookUp(LibraryLookupBase): def test_nvvm_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) # Check that CUDA_HOME works by removing conda-env by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') + self.assertEqual(by, "CUDA_HOME") self.assertFalse(warns) if IS_WIN32: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'bin')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "bin")) elif IS_OSX: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'lib')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "lib")) else: - self.assertEqual(info, os.path.join('mycudahome', 'nvvm', 'lib64')) + self.assertEqual(info, os.path.join("mycudahome", "nvvm", "lib64")) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, '') + self.assertEqual(by, "") self.assertIsNone(info) self.assertFalse(warns) else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_nvvm_path_decision() @staticmethod def do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_nvvm_path_decision() -@skip_on_cudasim('Library detection unsupported in the simulator') 
-@unittest.skipUnless(has_mp_get_context, 'mp.get_context not available') -@skip_unless_conda_cudatoolkit('test assumes conda installed cudatoolkit') +@skip_on_cudasim("Library detection unsupported in the simulator") +@unittest.skipUnless(has_mp_get_context, "mp.get_context not available") +@skip_unless_conda_cudatoolkit("test assumes conda installed cudatoolkit") class TestCudaLibLookUp(LibraryLookupBase): def test_cudalib_path_decision(self): # Check that the default is using conda environment by, info, warns = self.remote_do(self.do_clear_envs) if has_cuda: - self.assertEqual(by, 'Conda environment') + self.assertEqual(by, "Conda environment") else: self.assertEqual(by, "") self.assertIsNone(info) @@ -194,14 +194,14 @@ def test_cudalib_path_decision(self): # Check that CUDA_HOME works by removing conda-env self.remote_do(self.do_clear_envs) by, info, warns = self.remote_do(self.do_set_cuda_home) - self.assertEqual(by, 'CUDA_HOME') + self.assertEqual(by, "CUDA_HOME") self.assertFalse(warns) if IS_WIN32: - self.assertEqual(info, os.path.join('mycudahome', 'bin')) + self.assertEqual(info, os.path.join("mycudahome", "bin")) elif IS_OSX: - self.assertEqual(info, os.path.join('mycudahome', 'lib')) + self.assertEqual(info, os.path.join("mycudahome", "lib")) else: - self.assertEqual(info, os.path.join('mycudahome', 'lib64')) + self.assertEqual(info, os.path.join("mycudahome", "lib64")) if get_system_ctk() is None: # Fake remove conda environment so no cudatoolkit is available by, info, warns = self.remote_do(self.do_clear_envs) @@ -211,18 +211,18 @@ def test_cudalib_path_decision(self): else: # Use system available cudatoolkit by, info, warns = self.remote_do(self.do_clear_envs) - self.assertEqual(by, 'System') + self.assertEqual(by, "System") self.assertFalse(warns) @staticmethod def do_clear_envs(): - remove_env('CUDA_HOME') - remove_env('CUDA_PATH') + remove_env("CUDA_HOME") + remove_env("CUDA_PATH") return True, _get_cudalib_dir_path_decision() @staticmethod def 
do_set_cuda_home(): - os.environ['CUDA_HOME'] = os.path.join('mycudahome') + os.environ["CUDA_HOME"] = os.path.join("mycudahome") _fake_non_conda_env() return True, _get_cudalib_dir_path_decision() @@ -231,8 +231,8 @@ def _fake_non_conda_env(): """ Monkeypatch sys.prefix to hide the fact we are in a conda-env """ - sys.prefix = '' + sys.prefix = "" -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py b/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py index 742aa1017..ef5af7b97 100644 --- a/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +++ b/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py @@ -8,14 +8,17 @@ import unittest -original = "call void @llvm.memset.p0i8.i64(" \ - "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" +original = ( + "call void @llvm.memset.p0i8.i64(" + "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" +) -missing_align = "call void @llvm.memset.p0i8.i64(" \ - "i8* %arg.x.41, i8 0, i64 %0, i1 false)" +missing_align = ( + "call void @llvm.memset.p0i8.i64(i8* %arg.x.41, i8 0, i64 %0, i1 false)" +) -@skip_on_cudasim('libNVVM not supported in simulator') +@skip_on_cudasim("libNVVM not supported in simulator") @unittest.skipIf(utils.MACHINE_BITS == 32, "CUDA not support for 32-bit") @unittest.skipIf(not nvvm.is_available(), "No libNVVM") class TestNvvmWithoutCuda(unittest.TestCase): @@ -30,10 +33,9 @@ def test_nvvm_accepts_encoding(self): # NVVM that it cannot parse correctly # Create a module with a constant containing all 8-bit characters - c = ir.Constant(ir.ArrayType(ir.IntType(8), 256), - bytearray(range(256))) + c = ir.Constant(ir.ArrayType(ir.IntType(8), 256), bytearray(range(256))) m = ir.Module() - m.triple = 'nvptx64-nvidia-cuda' + m.triple = "nvptx64-nvidia-cuda" nvvm.add_ir_version(m) gv = ir.GlobalVariable(m, c.type, "myconstant") gv.global_constant = True @@ -46,9 +48,9 @@ def test_nvvm_accepts_encoding(self): # Ensure all characters appear in the generated 
constant array. elements = ", ".join([str(i) for i in range(256)]) - myconstant = f"myconstant[256] = {{{elements}}}".encode('utf-8') + myconstant = f"myconstant[256] = {{{elements}}}".encode("utf-8") self.assertIn(myconstant, ptx) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py index 9f5f0fcf5..a621fe625 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py @@ -26,7 +26,7 @@ def g(): x = np.empty(10, np.int64) f(x) - g[1,1]() + g[1, 1]() cuda.synchronize() def test_nrt_ptx_contains_refcount(self): @@ -39,7 +39,7 @@ def g(): x = np.empty(10, np.int64) f(x) - g[1,1]() + g[1, 1]() ptx = next(iter(g.inspect_asm().values())) @@ -72,13 +72,12 @@ def g(out_ary): out_ary = np.zeros(1, dtype=np.int64) - g[1,1](out_ary) + g[1, 1](out_ary) self.assertEqual(out_ary[0], 1) class TestNrtStatistics(CUDATestCase): - def setUp(self): self._stream = cuda.default_stream() # Store the current stats state @@ -126,12 +125,11 @@ def foo(): # Check env var explicitly being set works env = os.environ.copy() - env['NUMBA_CUDA_NRT_STATS'] = "1" - env['NUMBA_CUDA_ENABLE_NRT'] = "1" + env["NUMBA_CUDA_NRT_STATS"] = "1" + env["NUMBA_CUDA_ENABLE_NRT"] = "1" run_in_subprocess(src, env=env) def check_env_var_off(self, env): - src = """if 1: from numba import cuda import numpy as np @@ -152,27 +150,26 @@ def foo(): def test_stats_env_var_explicit_off(self): # Checks that explicitly turning the stats off via the env var works. env = os.environ.copy() - env['NUMBA_CUDA_NRT_STATS'] = "0" + env["NUMBA_CUDA_NRT_STATS"] = "0" self.check_env_var_off(env) def test_stats_env_var_default_off(self): # Checks that the env var not being set is the same as "off", i.e. # default for Numba is off. 
env = os.environ.copy() - env.pop('NUMBA_CUDA_NRT_STATS', None) + env.pop("NUMBA_CUDA_NRT_STATS", None) self.check_env_var_off(env) def test_stats_status_toggle(self): - @cuda.jit def foo(): tmp = np.ones(3) - arr = np.arange(5 * tmp[0]) # noqa: F841 + arr = np.arange(5 * tmp[0]) # noqa: F841 return None with ( - override_config('CUDA_ENABLE_NRT', True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_ENABLE_NRT", True), + override_config("CUDA_NRT_STATS", True), ): # Switch on stats rtsys.memsys_enable_stats() @@ -218,9 +215,9 @@ def test_rtsys_stats_query_raises_exception_when_disabled(self): def test_nrt_explicit_stats_query_raises_exception_when_disabled(self): # Checks the various memsys_get_stats functions raise if queried when # the stats counters are disabled. - method_variations = ('alloc', 'free', 'mi_alloc', 'mi_free') + method_variations = ("alloc", "free", "mi_alloc", "mi_free") for meth in method_variations: - stats_func = getattr(rtsys, f'memsys_get_stats_{meth}') + stats_func = getattr(rtsys, f"memsys_get_stats_{meth}") with self.subTest(stats_func=stats_func): # Turn stats off rtsys.memsys_disable_stats() @@ -233,14 +230,13 @@ def test_read_one_stat(self): @cuda.jit def foo(): tmp = np.ones(3) - arr = np.arange(5 * tmp[0]) # noqa: F841 + arr = np.arange(5 * tmp[0]) # noqa: F841 return None with ( - override_config('CUDA_ENABLE_NRT', True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_ENABLE_NRT", True), + override_config("CUDA_NRT_STATS", True), ): - # Switch on stats rtsys.memsys_enable_stats() @@ -262,5 +258,5 @@ def foo(): self.assertEqual(stats.mi_free, stats_mi_free) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py index 1e9b7aa30..27811bdae 100644 --- a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py @@ -9,7 
+9,6 @@ class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase): - def setUp(self): super(TestNrtRefCt, self).setUp() @@ -19,7 +18,7 @@ def tearDown(self): def run(self, result=None): with ( override_config("CUDA_ENABLE_NRT", True), - override_config('CUDA_NRT_STATS', True) + override_config("CUDA_NRT_STATS", True), ): super(TestNrtRefCt, self).run(result) @@ -33,7 +32,7 @@ def test_no_return(self): @cuda.jit def kernel(): for i in range(n): - temp = np.empty(2) # noqa: F841 + temp = np.empty(2) # noqa: F841 return None init_stats = rtsys.get_allocation_stats() @@ -49,14 +48,13 @@ def test_escaping_var_init_in_loop(self): @cuda.jit def g(n): - x = np.empty((n, 2)) for i in range(n): y = x[i] for i in range(n): - y = x[i] # noqa: F841 + y = x[i] # noqa: F841 return None @@ -70,6 +68,7 @@ def test_invalid_computation_of_lifetime(self): """ Test issue #1573 """ + @cuda.jit def if_with_allocation_and_initialization(arr1, test1): tmp_arr = np.empty_like(arr1) @@ -85,13 +84,15 @@ def if_with_allocation_and_initialization(arr1, test1): init_stats = rtsys.get_allocation_stats() if_with_allocation_and_initialization[1, 1](arr, False) cur_stats = rtsys.get_allocation_stats() - self.assertEqual(cur_stats.alloc - init_stats.alloc, - cur_stats.free - init_stats.free) + self.assertEqual( + cur_stats.alloc - init_stats.alloc, cur_stats.free - init_stats.free + ) def test_del_at_beginning_of_loop(self): """ Test issue #1734 """ + @cuda.jit def f(arr): res = 0 @@ -108,9 +109,10 @@ def f(arr): init_stats = rtsys.get_allocation_stats() f[1, 1](arr) cur_stats = rtsys.get_allocation_stats() - self.assertEqual(cur_stats.alloc - init_stats.alloc, - cur_stats.free - init_stats.free) + self.assertEqual( + cur_stats.alloc - init_stats.alloc, cur_stats.free - init_stats.free + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat index 
9d47a334d..5eb0cbe9d 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat @@ -58,4 +58,4 @@ nvcc %NVCC_FLAGS% %LIBRARY_FLAGS% -o %OUTPUT_DIR%\test_device_functions.a test_d nvcc %NVCC_FLAGS% %LTOIR_FLAGS% -o %OUTPUT_DIR%\test_device_functions.ltoir.o test_device_functions.cu @REM Generate LTO-IR in a "raw" LTO-IR container -python generate_raw_ltoir.py --arch sm_%GPU_CC% -o %OUTPUT_DIR%\test_device_functions.ltoir test_device_functions.cu \ No newline at end of file +python generate_raw_ltoir.py --arch sm_%GPU_CC% -o %OUTPUT_DIR%\test_device_functions.ltoir test_device_functions.cu diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py index 934410b07..b4d32a34c 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py @@ -58,7 +58,7 @@ def determine_include_flags(): return None # NVCC writes to stdout on Windows and stderr on Linux - if platform.system() == 'Windows': + if platform.system() == "Windows": stream = cp.stdout else: stream = cp.stderr @@ -157,7 +157,7 @@ def main(sourcepath, outputpath, arch): parser.add_argument( "-a", "--arch", - help="compute arch to target (e.g. sm_87). " "Defaults to sm_50.", + help="compute arch to target (e.g. sm_87). Defaults to sm_50.", default="sm_50", ) diff --git a/numba_cuda/numba/cuda/types.py b/numba_cuda/numba/cuda/types.py index 531dcb2cc..92b8f3ecb 100644 --- a/numba_cuda/numba/cuda/types.py +++ b/numba_cuda/numba/cuda/types.py @@ -5,16 +5,18 @@ class Dim3(types.Type): """ A 3-tuple (x, y, z) representing the position of a block or thread. """ + def __init__(self): - super().__init__(name='Dim3') + super().__init__(name="Dim3") class GridGroup(types.Type): """ The grid of all threads in a cooperative kernel launch. 
""" + def __init__(self): - super().__init__(name='GridGroup') + super().__init__(name="GridGroup") dim3 = Dim3() @@ -23,6 +25,7 @@ def __init__(self): class CUDADispatcher(types.Dispatcher): """The type of CUDA dispatchers""" + # This type exists (instead of using types.Dispatcher as the type of CUDA # dispatchers) so that we can have an alternative lowering for them to the # lowering of CPU dispatchers - the CPU target lowers all dispatchers as a diff --git a/numba_cuda/numba/cuda/ufuncs.py b/numba_cuda/numba/cuda/ufuncs.py index 1ab3f9605..bcfff371f 100644 --- a/numba_cuda/numba/cuda/ufuncs.py +++ b/numba_cuda/numba/cuda/ufuncs.py @@ -10,8 +10,10 @@ import numpy as np from functools import lru_cache from numba.core import typing -from numba.cuda.mathimpl import (get_unary_impl_for_fn_and_ty, - get_binary_impl_for_fn_and_ty) +from numba.cuda.mathimpl import ( + get_unary_impl_for_fn_and_ty, + get_binary_impl_for_fn_and_ty, +) def get_ufunc_info(ufunc_key): @@ -173,490 +175,508 @@ def np_real_atanh_impl(context, builder, sig, args): db = {} db[np.sin] = { - 'f->f': np_real_sin_impl, - 'd->d': np_real_sin_impl, - 'F->F': npyfuncs.np_complex_sin_impl, - 'D->D': npyfuncs.np_complex_sin_impl, + "f->f": np_real_sin_impl, + "d->d": np_real_sin_impl, + "F->F": npyfuncs.np_complex_sin_impl, + "D->D": npyfuncs.np_complex_sin_impl, } db[np.cos] = { - 'f->f': np_real_cos_impl, - 'd->d': np_real_cos_impl, - 'F->F': npyfuncs.np_complex_cos_impl, - 'D->D': npyfuncs.np_complex_cos_impl, + "f->f": np_real_cos_impl, + "d->d": np_real_cos_impl, + "F->F": npyfuncs.np_complex_cos_impl, + "D->D": npyfuncs.np_complex_cos_impl, } db[np.tan] = { - 'f->f': np_real_tan_impl, - 'd->d': np_real_tan_impl, - 'F->F': cmathimpl.tan_impl, - 'D->D': cmathimpl.tan_impl, + "f->f": np_real_tan_impl, + "d->d": np_real_tan_impl, + "F->F": cmathimpl.tan_impl, + "D->D": cmathimpl.tan_impl, } db[np.arcsin] = { - 'f->f': np_real_asin_impl, - 'd->d': np_real_asin_impl, - 'F->F': cmathimpl.asin_impl, - 
'D->D': cmathimpl.asin_impl, + "f->f": np_real_asin_impl, + "d->d": np_real_asin_impl, + "F->F": cmathimpl.asin_impl, + "D->D": cmathimpl.asin_impl, } db[np.arccos] = { - 'f->f': np_real_acos_impl, - 'd->d': np_real_acos_impl, - 'F->F': cmathimpl.acos_impl, - 'D->D': cmathimpl.acos_impl, + "f->f": np_real_acos_impl, + "d->d": np_real_acos_impl, + "F->F": cmathimpl.acos_impl, + "D->D": cmathimpl.acos_impl, } db[np.arctan] = { - 'f->f': np_real_atan_impl, - 'd->d': np_real_atan_impl, - 'F->F': cmathimpl.atan_impl, - 'D->D': cmathimpl.atan_impl, + "f->f": np_real_atan_impl, + "d->d": np_real_atan_impl, + "F->F": cmathimpl.atan_impl, + "D->D": cmathimpl.atan_impl, } db[np.arctan2] = { - 'ff->f': np_real_atan2_impl, - 'dd->d': np_real_atan2_impl, + "ff->f": np_real_atan2_impl, + "dd->d": np_real_atan2_impl, } db[np.hypot] = { - 'ff->f': np_real_hypot_impl, - 'dd->d': np_real_hypot_impl, + "ff->f": np_real_hypot_impl, + "dd->d": np_real_hypot_impl, } db[np.sinh] = { - 'f->f': np_real_sinh_impl, - 'd->d': np_real_sinh_impl, - 'F->F': np_complex_sinh_impl, - 'D->D': np_complex_sinh_impl, + "f->f": np_real_sinh_impl, + "d->d": np_real_sinh_impl, + "F->F": np_complex_sinh_impl, + "D->D": np_complex_sinh_impl, } db[np.cosh] = { - 'f->f': np_real_cosh_impl, - 'd->d': np_real_cosh_impl, - 'F->F': np_complex_cosh_impl, - 'D->D': np_complex_cosh_impl, + "f->f": np_real_cosh_impl, + "d->d": np_real_cosh_impl, + "F->F": np_complex_cosh_impl, + "D->D": np_complex_cosh_impl, } db[np.tanh] = { - 'f->f': np_real_tanh_impl, - 'd->d': np_real_tanh_impl, - 'F->F': np_complex_tanh_impl, - 'D->D': np_complex_tanh_impl, + "f->f": np_real_tanh_impl, + "d->d": np_real_tanh_impl, + "F->F": np_complex_tanh_impl, + "D->D": np_complex_tanh_impl, } db[np.arcsinh] = { - 'f->f': np_real_asinh_impl, - 'd->d': np_real_asinh_impl, - 'F->F': cmathimpl.asinh_impl, - 'D->D': cmathimpl.asinh_impl, + "f->f": np_real_asinh_impl, + "d->d": np_real_asinh_impl, + "F->F": cmathimpl.asinh_impl, + "D->D": 
cmathimpl.asinh_impl, } db[np.arccosh] = { - 'f->f': np_real_acosh_impl, - 'd->d': np_real_acosh_impl, - 'F->F': npyfuncs.np_complex_acosh_impl, - 'D->D': npyfuncs.np_complex_acosh_impl, + "f->f": np_real_acosh_impl, + "d->d": np_real_acosh_impl, + "F->F": npyfuncs.np_complex_acosh_impl, + "D->D": npyfuncs.np_complex_acosh_impl, } db[np.arctanh] = { - 'f->f': np_real_atanh_impl, - 'd->d': np_real_atanh_impl, - 'F->F': cmathimpl.atanh_impl, - 'D->D': cmathimpl.atanh_impl, + "f->f": np_real_atanh_impl, + "d->d": np_real_atanh_impl, + "F->F": cmathimpl.atanh_impl, + "D->D": cmathimpl.atanh_impl, } db[np.deg2rad] = { - 'f->f': mathimpl.radians_float_impl, - 'd->d': mathimpl.radians_float_impl, + "f->f": mathimpl.radians_float_impl, + "d->d": mathimpl.radians_float_impl, } db[np.radians] = db[np.deg2rad] db[np.rad2deg] = { - 'f->f': mathimpl.degrees_float_impl, - 'd->d': mathimpl.degrees_float_impl, + "f->f": mathimpl.degrees_float_impl, + "d->d": mathimpl.degrees_float_impl, } db[np.degrees] = db[np.rad2deg] db[np.greater] = { - '??->?': numbers.int_ugt_impl, - 'bb->?': numbers.int_sgt_impl, - 'BB->?': numbers.int_ugt_impl, - 'hh->?': numbers.int_sgt_impl, - 'HH->?': numbers.int_ugt_impl, - 'ii->?': numbers.int_sgt_impl, - 'II->?': numbers.int_ugt_impl, - 'll->?': numbers.int_sgt_impl, - 'LL->?': numbers.int_ugt_impl, - 'qq->?': numbers.int_sgt_impl, - 'QQ->?': numbers.int_ugt_impl, - 'ff->?': numbers.real_gt_impl, - 'dd->?': numbers.real_gt_impl, - 'FF->?': npyfuncs.np_complex_gt_impl, - 'DD->?': npyfuncs.np_complex_gt_impl, + "??->?": numbers.int_ugt_impl, + "bb->?": numbers.int_sgt_impl, + "BB->?": numbers.int_ugt_impl, + "hh->?": numbers.int_sgt_impl, + "HH->?": numbers.int_ugt_impl, + "ii->?": numbers.int_sgt_impl, + "II->?": numbers.int_ugt_impl, + "ll->?": numbers.int_sgt_impl, + "LL->?": numbers.int_ugt_impl, + "qq->?": numbers.int_sgt_impl, + "QQ->?": numbers.int_ugt_impl, + "ff->?": numbers.real_gt_impl, + "dd->?": numbers.real_gt_impl, + "FF->?": 
npyfuncs.np_complex_gt_impl, + "DD->?": npyfuncs.np_complex_gt_impl, } if numpy_version >= (1, 25): - db[np.greater].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('>'), - 'Qq->?': numbers.int_unsigned_signed_cmp('>')}) + db[np.greater].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">"), + "Qq->?": numbers.int_unsigned_signed_cmp(">"), + } + ) db[np.greater_equal] = { - '??->?': numbers.int_uge_impl, - 'bb->?': numbers.int_sge_impl, - 'BB->?': numbers.int_uge_impl, - 'hh->?': numbers.int_sge_impl, - 'HH->?': numbers.int_uge_impl, - 'ii->?': numbers.int_sge_impl, - 'II->?': numbers.int_uge_impl, - 'll->?': numbers.int_sge_impl, - 'LL->?': numbers.int_uge_impl, - 'qq->?': numbers.int_sge_impl, - 'QQ->?': numbers.int_uge_impl, - 'ff->?': numbers.real_ge_impl, - 'dd->?': numbers.real_ge_impl, - 'FF->?': npyfuncs.np_complex_ge_impl, - 'DD->?': npyfuncs.np_complex_ge_impl, + "??->?": numbers.int_uge_impl, + "bb->?": numbers.int_sge_impl, + "BB->?": numbers.int_uge_impl, + "hh->?": numbers.int_sge_impl, + "HH->?": numbers.int_uge_impl, + "ii->?": numbers.int_sge_impl, + "II->?": numbers.int_uge_impl, + "ll->?": numbers.int_sge_impl, + "LL->?": numbers.int_uge_impl, + "qq->?": numbers.int_sge_impl, + "QQ->?": numbers.int_uge_impl, + "ff->?": numbers.real_ge_impl, + "dd->?": numbers.real_ge_impl, + "FF->?": npyfuncs.np_complex_ge_impl, + "DD->?": npyfuncs.np_complex_ge_impl, } if numpy_version >= (1, 25): - db[np.greater_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('>='), - 'Qq->?': numbers.int_unsigned_signed_cmp('>=')}) + db[np.greater_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp(">="), + "Qq->?": numbers.int_unsigned_signed_cmp(">="), + } + ) db[np.less] = { - '??->?': numbers.int_ult_impl, - 'bb->?': numbers.int_slt_impl, - 'BB->?': numbers.int_ult_impl, - 'hh->?': numbers.int_slt_impl, - 'HH->?': numbers.int_ult_impl, - 'ii->?': numbers.int_slt_impl, - 'II->?': numbers.int_ult_impl, - 'll->?': numbers.int_slt_impl, - 
'LL->?': numbers.int_ult_impl, - 'qq->?': numbers.int_slt_impl, - 'QQ->?': numbers.int_ult_impl, - 'ff->?': numbers.real_lt_impl, - 'dd->?': numbers.real_lt_impl, - 'FF->?': npyfuncs.np_complex_lt_impl, - 'DD->?': npyfuncs.np_complex_lt_impl, + "??->?": numbers.int_ult_impl, + "bb->?": numbers.int_slt_impl, + "BB->?": numbers.int_ult_impl, + "hh->?": numbers.int_slt_impl, + "HH->?": numbers.int_ult_impl, + "ii->?": numbers.int_slt_impl, + "II->?": numbers.int_ult_impl, + "ll->?": numbers.int_slt_impl, + "LL->?": numbers.int_ult_impl, + "qq->?": numbers.int_slt_impl, + "QQ->?": numbers.int_ult_impl, + "ff->?": numbers.real_lt_impl, + "dd->?": numbers.real_lt_impl, + "FF->?": npyfuncs.np_complex_lt_impl, + "DD->?": npyfuncs.np_complex_lt_impl, } if numpy_version >= (1, 25): - db[np.less].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('<'), - 'Qq->?': numbers.int_unsigned_signed_cmp('<')}) + db[np.less].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<"), + "Qq->?": numbers.int_unsigned_signed_cmp("<"), + } + ) db[np.less_equal] = { - '??->?': numbers.int_ule_impl, - 'bb->?': numbers.int_sle_impl, - 'BB->?': numbers.int_ule_impl, - 'hh->?': numbers.int_sle_impl, - 'HH->?': numbers.int_ule_impl, - 'ii->?': numbers.int_sle_impl, - 'II->?': numbers.int_ule_impl, - 'll->?': numbers.int_sle_impl, - 'LL->?': numbers.int_ule_impl, - 'qq->?': numbers.int_sle_impl, - 'QQ->?': numbers.int_ule_impl, - 'ff->?': numbers.real_le_impl, - 'dd->?': numbers.real_le_impl, - 'FF->?': npyfuncs.np_complex_le_impl, - 'DD->?': npyfuncs.np_complex_le_impl, + "??->?": numbers.int_ule_impl, + "bb->?": numbers.int_sle_impl, + "BB->?": numbers.int_ule_impl, + "hh->?": numbers.int_sle_impl, + "HH->?": numbers.int_ule_impl, + "ii->?": numbers.int_sle_impl, + "II->?": numbers.int_ule_impl, + "ll->?": numbers.int_sle_impl, + "LL->?": numbers.int_ule_impl, + "qq->?": numbers.int_sle_impl, + "QQ->?": numbers.int_ule_impl, + "ff->?": numbers.real_le_impl, + "dd->?": numbers.real_le_impl, + 
"FF->?": npyfuncs.np_complex_le_impl, + "DD->?": npyfuncs.np_complex_le_impl, } if numpy_version >= (1, 25): - db[np.less_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('<='), - 'Qq->?': numbers.int_unsigned_signed_cmp('<=')}) + db[np.less_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("<="), + "Qq->?": numbers.int_unsigned_signed_cmp("<="), + } + ) db[np.not_equal] = { - '??->?': numbers.int_ne_impl, - 'bb->?': numbers.int_ne_impl, - 'BB->?': numbers.int_ne_impl, - 'hh->?': numbers.int_ne_impl, - 'HH->?': numbers.int_ne_impl, - 'ii->?': numbers.int_ne_impl, - 'II->?': numbers.int_ne_impl, - 'll->?': numbers.int_ne_impl, - 'LL->?': numbers.int_ne_impl, - 'qq->?': numbers.int_ne_impl, - 'QQ->?': numbers.int_ne_impl, - 'ff->?': numbers.real_ne_impl, - 'dd->?': numbers.real_ne_impl, - 'FF->?': npyfuncs.np_complex_ne_impl, - 'DD->?': npyfuncs.np_complex_ne_impl, + "??->?": numbers.int_ne_impl, + "bb->?": numbers.int_ne_impl, + "BB->?": numbers.int_ne_impl, + "hh->?": numbers.int_ne_impl, + "HH->?": numbers.int_ne_impl, + "ii->?": numbers.int_ne_impl, + "II->?": numbers.int_ne_impl, + "ll->?": numbers.int_ne_impl, + "LL->?": numbers.int_ne_impl, + "qq->?": numbers.int_ne_impl, + "QQ->?": numbers.int_ne_impl, + "ff->?": numbers.real_ne_impl, + "dd->?": numbers.real_ne_impl, + "FF->?": npyfuncs.np_complex_ne_impl, + "DD->?": npyfuncs.np_complex_ne_impl, } if numpy_version >= (1, 25): - db[np.not_equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('!='), - 'Qq->?': numbers.int_unsigned_signed_cmp('!=')}) + db[np.not_equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("!="), + "Qq->?": numbers.int_unsigned_signed_cmp("!="), + } + ) db[np.equal] = { - '??->?': numbers.int_eq_impl, - 'bb->?': numbers.int_eq_impl, - 'BB->?': numbers.int_eq_impl, - 'hh->?': numbers.int_eq_impl, - 'HH->?': numbers.int_eq_impl, - 'ii->?': numbers.int_eq_impl, - 'II->?': numbers.int_eq_impl, - 'll->?': numbers.int_eq_impl, - 'LL->?': numbers.int_eq_impl, 
- 'qq->?': numbers.int_eq_impl, - 'QQ->?': numbers.int_eq_impl, - 'ff->?': numbers.real_eq_impl, - 'dd->?': numbers.real_eq_impl, - 'FF->?': npyfuncs.np_complex_eq_impl, - 'DD->?': npyfuncs.np_complex_eq_impl, + "??->?": numbers.int_eq_impl, + "bb->?": numbers.int_eq_impl, + "BB->?": numbers.int_eq_impl, + "hh->?": numbers.int_eq_impl, + "HH->?": numbers.int_eq_impl, + "ii->?": numbers.int_eq_impl, + "II->?": numbers.int_eq_impl, + "ll->?": numbers.int_eq_impl, + "LL->?": numbers.int_eq_impl, + "qq->?": numbers.int_eq_impl, + "QQ->?": numbers.int_eq_impl, + "ff->?": numbers.real_eq_impl, + "dd->?": numbers.real_eq_impl, + "FF->?": npyfuncs.np_complex_eq_impl, + "DD->?": npyfuncs.np_complex_eq_impl, } if numpy_version >= (1, 25): - db[np.equal].update({ - 'qQ->?': numbers.int_signed_unsigned_cmp('=='), - 'Qq->?': numbers.int_unsigned_signed_cmp('==')}) + db[np.equal].update( + { + "qQ->?": numbers.int_signed_unsigned_cmp("=="), + "Qq->?": numbers.int_unsigned_signed_cmp("=="), + } + ) db[np.logical_and] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->?': npyfuncs.np_logical_and_impl, - 'BB->?': npyfuncs.np_logical_and_impl, - 'hh->?': npyfuncs.np_logical_and_impl, - 'HH->?': npyfuncs.np_logical_and_impl, - 'ii->?': npyfuncs.np_logical_and_impl, - 'II->?': npyfuncs.np_logical_and_impl, - 'll->?': npyfuncs.np_logical_and_impl, - 'LL->?': npyfuncs.np_logical_and_impl, - 'qq->?': npyfuncs.np_logical_and_impl, - 'QQ->?': npyfuncs.np_logical_and_impl, - 'ff->?': npyfuncs.np_logical_and_impl, - 'dd->?': npyfuncs.np_logical_and_impl, - 'FF->?': npyfuncs.np_complex_logical_and_impl, - 'DD->?': npyfuncs.np_complex_logical_and_impl, + "??->?": npyfuncs.np_logical_and_impl, + "bb->?": npyfuncs.np_logical_and_impl, + "BB->?": npyfuncs.np_logical_and_impl, + "hh->?": npyfuncs.np_logical_and_impl, + "HH->?": npyfuncs.np_logical_and_impl, + "ii->?": npyfuncs.np_logical_and_impl, + "II->?": npyfuncs.np_logical_and_impl, + "ll->?": npyfuncs.np_logical_and_impl, + "LL->?": 
npyfuncs.np_logical_and_impl, + "qq->?": npyfuncs.np_logical_and_impl, + "QQ->?": npyfuncs.np_logical_and_impl, + "ff->?": npyfuncs.np_logical_and_impl, + "dd->?": npyfuncs.np_logical_and_impl, + "FF->?": npyfuncs.np_complex_logical_and_impl, + "DD->?": npyfuncs.np_complex_logical_and_impl, } db[np.logical_or] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->?': npyfuncs.np_logical_or_impl, - 'BB->?': npyfuncs.np_logical_or_impl, - 'hh->?': npyfuncs.np_logical_or_impl, - 'HH->?': npyfuncs.np_logical_or_impl, - 'ii->?': npyfuncs.np_logical_or_impl, - 'II->?': npyfuncs.np_logical_or_impl, - 'll->?': npyfuncs.np_logical_or_impl, - 'LL->?': npyfuncs.np_logical_or_impl, - 'qq->?': npyfuncs.np_logical_or_impl, - 'QQ->?': npyfuncs.np_logical_or_impl, - 'ff->?': npyfuncs.np_logical_or_impl, - 'dd->?': npyfuncs.np_logical_or_impl, - 'FF->?': npyfuncs.np_complex_logical_or_impl, - 'DD->?': npyfuncs.np_complex_logical_or_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->?": npyfuncs.np_logical_or_impl, + "BB->?": npyfuncs.np_logical_or_impl, + "hh->?": npyfuncs.np_logical_or_impl, + "HH->?": npyfuncs.np_logical_or_impl, + "ii->?": npyfuncs.np_logical_or_impl, + "II->?": npyfuncs.np_logical_or_impl, + "ll->?": npyfuncs.np_logical_or_impl, + "LL->?": npyfuncs.np_logical_or_impl, + "qq->?": npyfuncs.np_logical_or_impl, + "QQ->?": npyfuncs.np_logical_or_impl, + "ff->?": npyfuncs.np_logical_or_impl, + "dd->?": npyfuncs.np_logical_or_impl, + "FF->?": npyfuncs.np_complex_logical_or_impl, + "DD->?": npyfuncs.np_complex_logical_or_impl, } db[np.logical_xor] = { - '??->?': npyfuncs.np_logical_xor_impl, - 'bb->?': npyfuncs.np_logical_xor_impl, - 'BB->?': npyfuncs.np_logical_xor_impl, - 'hh->?': npyfuncs.np_logical_xor_impl, - 'HH->?': npyfuncs.np_logical_xor_impl, - 'ii->?': npyfuncs.np_logical_xor_impl, - 'II->?': npyfuncs.np_logical_xor_impl, - 'll->?': npyfuncs.np_logical_xor_impl, - 'LL->?': npyfuncs.np_logical_xor_impl, - 'qq->?': npyfuncs.np_logical_xor_impl, - 'QQ->?': 
npyfuncs.np_logical_xor_impl, - 'ff->?': npyfuncs.np_logical_xor_impl, - 'dd->?': npyfuncs.np_logical_xor_impl, - 'FF->?': npyfuncs.np_complex_logical_xor_impl, - 'DD->?': npyfuncs.np_complex_logical_xor_impl, + "??->?": npyfuncs.np_logical_xor_impl, + "bb->?": npyfuncs.np_logical_xor_impl, + "BB->?": npyfuncs.np_logical_xor_impl, + "hh->?": npyfuncs.np_logical_xor_impl, + "HH->?": npyfuncs.np_logical_xor_impl, + "ii->?": npyfuncs.np_logical_xor_impl, + "II->?": npyfuncs.np_logical_xor_impl, + "ll->?": npyfuncs.np_logical_xor_impl, + "LL->?": npyfuncs.np_logical_xor_impl, + "qq->?": npyfuncs.np_logical_xor_impl, + "QQ->?": npyfuncs.np_logical_xor_impl, + "ff->?": npyfuncs.np_logical_xor_impl, + "dd->?": npyfuncs.np_logical_xor_impl, + "FF->?": npyfuncs.np_complex_logical_xor_impl, + "DD->?": npyfuncs.np_complex_logical_xor_impl, } db[np.logical_not] = { - '?->?': npyfuncs.np_logical_not_impl, - 'b->?': npyfuncs.np_logical_not_impl, - 'B->?': npyfuncs.np_logical_not_impl, - 'h->?': npyfuncs.np_logical_not_impl, - 'H->?': npyfuncs.np_logical_not_impl, - 'i->?': npyfuncs.np_logical_not_impl, - 'I->?': npyfuncs.np_logical_not_impl, - 'l->?': npyfuncs.np_logical_not_impl, - 'L->?': npyfuncs.np_logical_not_impl, - 'q->?': npyfuncs.np_logical_not_impl, - 'Q->?': npyfuncs.np_logical_not_impl, - 'f->?': npyfuncs.np_logical_not_impl, - 'd->?': npyfuncs.np_logical_not_impl, - 'F->?': npyfuncs.np_complex_logical_not_impl, - 'D->?': npyfuncs.np_complex_logical_not_impl, + "?->?": npyfuncs.np_logical_not_impl, + "b->?": npyfuncs.np_logical_not_impl, + "B->?": npyfuncs.np_logical_not_impl, + "h->?": npyfuncs.np_logical_not_impl, + "H->?": npyfuncs.np_logical_not_impl, + "i->?": npyfuncs.np_logical_not_impl, + "I->?": npyfuncs.np_logical_not_impl, + "l->?": npyfuncs.np_logical_not_impl, + "L->?": npyfuncs.np_logical_not_impl, + "q->?": npyfuncs.np_logical_not_impl, + "Q->?": npyfuncs.np_logical_not_impl, + "f->?": npyfuncs.np_logical_not_impl, + "d->?": 
npyfuncs.np_logical_not_impl, + "F->?": npyfuncs.np_complex_logical_not_impl, + "D->?": npyfuncs.np_complex_logical_not_impl, } db[np.maximum] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->b': npyfuncs.np_int_smax_impl, - 'BB->B': npyfuncs.np_int_umax_impl, - 'hh->h': npyfuncs.np_int_smax_impl, - 'HH->H': npyfuncs.np_int_umax_impl, - 'ii->i': npyfuncs.np_int_smax_impl, - 'II->I': npyfuncs.np_int_umax_impl, - 'll->l': npyfuncs.np_int_smax_impl, - 'LL->L': npyfuncs.np_int_umax_impl, - 'qq->q': npyfuncs.np_int_smax_impl, - 'QQ->Q': npyfuncs.np_int_umax_impl, - 'ff->f': npyfuncs.np_real_maximum_impl, - 'dd->d': npyfuncs.np_real_maximum_impl, - 'FF->F': npyfuncs.np_complex_maximum_impl, - 'DD->D': npyfuncs.np_complex_maximum_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_maximum_impl, + "dd->d": npyfuncs.np_real_maximum_impl, + "FF->F": npyfuncs.np_complex_maximum_impl, + "DD->D": npyfuncs.np_complex_maximum_impl, } db[np.minimum] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->b': npyfuncs.np_int_smin_impl, - 'BB->B': npyfuncs.np_int_umin_impl, - 'hh->h': npyfuncs.np_int_smin_impl, - 'HH->H': npyfuncs.np_int_umin_impl, - 'ii->i': npyfuncs.np_int_smin_impl, - 'II->I': npyfuncs.np_int_umin_impl, - 'll->l': npyfuncs.np_int_smin_impl, - 'LL->L': npyfuncs.np_int_umin_impl, - 'qq->q': npyfuncs.np_int_smin_impl, - 'QQ->Q': npyfuncs.np_int_umin_impl, - 'ff->f': npyfuncs.np_real_minimum_impl, - 'dd->d': npyfuncs.np_real_minimum_impl, - 'FF->F': npyfuncs.np_complex_minimum_impl, - 'DD->D': npyfuncs.np_complex_minimum_impl, + "??->?": npyfuncs.np_logical_and_impl, + 
"bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_minimum_impl, + "dd->d": npyfuncs.np_real_minimum_impl, + "FF->F": npyfuncs.np_complex_minimum_impl, + "DD->D": npyfuncs.np_complex_minimum_impl, } db[np.fmax] = { - '??->?': npyfuncs.np_logical_or_impl, - 'bb->b': npyfuncs.np_int_smax_impl, - 'BB->B': npyfuncs.np_int_umax_impl, - 'hh->h': npyfuncs.np_int_smax_impl, - 'HH->H': npyfuncs.np_int_umax_impl, - 'ii->i': npyfuncs.np_int_smax_impl, - 'II->I': npyfuncs.np_int_umax_impl, - 'll->l': npyfuncs.np_int_smax_impl, - 'LL->L': npyfuncs.np_int_umax_impl, - 'qq->q': npyfuncs.np_int_smax_impl, - 'QQ->Q': npyfuncs.np_int_umax_impl, - 'ff->f': npyfuncs.np_real_fmax_impl, - 'dd->d': npyfuncs.np_real_fmax_impl, - 'FF->F': npyfuncs.np_complex_fmax_impl, - 'DD->D': npyfuncs.np_complex_fmax_impl, + "??->?": npyfuncs.np_logical_or_impl, + "bb->b": npyfuncs.np_int_smax_impl, + "BB->B": npyfuncs.np_int_umax_impl, + "hh->h": npyfuncs.np_int_smax_impl, + "HH->H": npyfuncs.np_int_umax_impl, + "ii->i": npyfuncs.np_int_smax_impl, + "II->I": npyfuncs.np_int_umax_impl, + "ll->l": npyfuncs.np_int_smax_impl, + "LL->L": npyfuncs.np_int_umax_impl, + "qq->q": npyfuncs.np_int_smax_impl, + "QQ->Q": npyfuncs.np_int_umax_impl, + "ff->f": npyfuncs.np_real_fmax_impl, + "dd->d": npyfuncs.np_real_fmax_impl, + "FF->F": npyfuncs.np_complex_fmax_impl, + "DD->D": npyfuncs.np_complex_fmax_impl, } db[np.fmin] = { - '??->?': npyfuncs.np_logical_and_impl, - 'bb->b': npyfuncs.np_int_smin_impl, - 'BB->B': npyfuncs.np_int_umin_impl, - 'hh->h': npyfuncs.np_int_smin_impl, - 'HH->H': npyfuncs.np_int_umin_impl, - 'ii->i': npyfuncs.np_int_smin_impl, - 'II->I': 
npyfuncs.np_int_umin_impl, - 'll->l': npyfuncs.np_int_smin_impl, - 'LL->L': npyfuncs.np_int_umin_impl, - 'qq->q': npyfuncs.np_int_smin_impl, - 'QQ->Q': npyfuncs.np_int_umin_impl, - 'ff->f': npyfuncs.np_real_fmin_impl, - 'dd->d': npyfuncs.np_real_fmin_impl, - 'FF->F': npyfuncs.np_complex_fmin_impl, - 'DD->D': npyfuncs.np_complex_fmin_impl, + "??->?": npyfuncs.np_logical_and_impl, + "bb->b": npyfuncs.np_int_smin_impl, + "BB->B": npyfuncs.np_int_umin_impl, + "hh->h": npyfuncs.np_int_smin_impl, + "HH->H": npyfuncs.np_int_umin_impl, + "ii->i": npyfuncs.np_int_smin_impl, + "II->I": npyfuncs.np_int_umin_impl, + "ll->l": npyfuncs.np_int_smin_impl, + "LL->L": npyfuncs.np_int_umin_impl, + "qq->q": npyfuncs.np_int_smin_impl, + "QQ->Q": npyfuncs.np_int_umin_impl, + "ff->f": npyfuncs.np_real_fmin_impl, + "dd->d": npyfuncs.np_real_fmin_impl, + "FF->F": npyfuncs.np_complex_fmin_impl, + "DD->D": npyfuncs.np_complex_fmin_impl, } db[np.bitwise_and] = { - '??->?': numbers.int_and_impl, - 'bb->b': numbers.int_and_impl, - 'BB->B': numbers.int_and_impl, - 'hh->h': numbers.int_and_impl, - 'HH->H': numbers.int_and_impl, - 'ii->i': numbers.int_and_impl, - 'II->I': numbers.int_and_impl, - 'll->l': numbers.int_and_impl, - 'LL->L': numbers.int_and_impl, - 'qq->q': numbers.int_and_impl, - 'QQ->Q': numbers.int_and_impl, + "??->?": numbers.int_and_impl, + "bb->b": numbers.int_and_impl, + "BB->B": numbers.int_and_impl, + "hh->h": numbers.int_and_impl, + "HH->H": numbers.int_and_impl, + "ii->i": numbers.int_and_impl, + "II->I": numbers.int_and_impl, + "ll->l": numbers.int_and_impl, + "LL->L": numbers.int_and_impl, + "qq->q": numbers.int_and_impl, + "QQ->Q": numbers.int_and_impl, } db[np.bitwise_or] = { - '??->?': numbers.int_or_impl, - 'bb->b': numbers.int_or_impl, - 'BB->B': numbers.int_or_impl, - 'hh->h': numbers.int_or_impl, - 'HH->H': numbers.int_or_impl, - 'ii->i': numbers.int_or_impl, - 'II->I': numbers.int_or_impl, - 'll->l': numbers.int_or_impl, - 'LL->L': numbers.int_or_impl, - 'qq->q': 
numbers.int_or_impl, - 'QQ->Q': numbers.int_or_impl, + "??->?": numbers.int_or_impl, + "bb->b": numbers.int_or_impl, + "BB->B": numbers.int_or_impl, + "hh->h": numbers.int_or_impl, + "HH->H": numbers.int_or_impl, + "ii->i": numbers.int_or_impl, + "II->I": numbers.int_or_impl, + "ll->l": numbers.int_or_impl, + "LL->L": numbers.int_or_impl, + "qq->q": numbers.int_or_impl, + "QQ->Q": numbers.int_or_impl, } db[np.bitwise_xor] = { - '??->?': numbers.int_xor_impl, - 'bb->b': numbers.int_xor_impl, - 'BB->B': numbers.int_xor_impl, - 'hh->h': numbers.int_xor_impl, - 'HH->H': numbers.int_xor_impl, - 'ii->i': numbers.int_xor_impl, - 'II->I': numbers.int_xor_impl, - 'll->l': numbers.int_xor_impl, - 'LL->L': numbers.int_xor_impl, - 'qq->q': numbers.int_xor_impl, - 'QQ->Q': numbers.int_xor_impl, + "??->?": numbers.int_xor_impl, + "bb->b": numbers.int_xor_impl, + "BB->B": numbers.int_xor_impl, + "hh->h": numbers.int_xor_impl, + "HH->H": numbers.int_xor_impl, + "ii->i": numbers.int_xor_impl, + "II->I": numbers.int_xor_impl, + "ll->l": numbers.int_xor_impl, + "LL->L": numbers.int_xor_impl, + "qq->q": numbers.int_xor_impl, + "QQ->Q": numbers.int_xor_impl, } db[np.invert] = { - '?->?': numbers.int_invert_impl, - 'b->b': numbers.int_invert_impl, - 'B->B': numbers.int_invert_impl, - 'h->h': numbers.int_invert_impl, - 'H->H': numbers.int_invert_impl, - 'i->i': numbers.int_invert_impl, - 'I->I': numbers.int_invert_impl, - 'l->l': numbers.int_invert_impl, - 'L->L': numbers.int_invert_impl, - 'q->q': numbers.int_invert_impl, - 'Q->Q': numbers.int_invert_impl, + "?->?": numbers.int_invert_impl, + "b->b": numbers.int_invert_impl, + "B->B": numbers.int_invert_impl, + "h->h": numbers.int_invert_impl, + "H->H": numbers.int_invert_impl, + "i->i": numbers.int_invert_impl, + "I->I": numbers.int_invert_impl, + "l->l": numbers.int_invert_impl, + "L->L": numbers.int_invert_impl, + "q->q": numbers.int_invert_impl, + "Q->Q": numbers.int_invert_impl, } db[np.left_shift] = { - 'bb->b': 
numbers.int_shl_impl, - 'BB->B': numbers.int_shl_impl, - 'hh->h': numbers.int_shl_impl, - 'HH->H': numbers.int_shl_impl, - 'ii->i': numbers.int_shl_impl, - 'II->I': numbers.int_shl_impl, - 'll->l': numbers.int_shl_impl, - 'LL->L': numbers.int_shl_impl, - 'qq->q': numbers.int_shl_impl, - 'QQ->Q': numbers.int_shl_impl, + "bb->b": numbers.int_shl_impl, + "BB->B": numbers.int_shl_impl, + "hh->h": numbers.int_shl_impl, + "HH->H": numbers.int_shl_impl, + "ii->i": numbers.int_shl_impl, + "II->I": numbers.int_shl_impl, + "ll->l": numbers.int_shl_impl, + "LL->L": numbers.int_shl_impl, + "qq->q": numbers.int_shl_impl, + "QQ->Q": numbers.int_shl_impl, } db[np.right_shift] = { - 'bb->b': numbers.int_shr_impl, - 'BB->B': numbers.int_shr_impl, - 'hh->h': numbers.int_shr_impl, - 'HH->H': numbers.int_shr_impl, - 'ii->i': numbers.int_shr_impl, - 'II->I': numbers.int_shr_impl, - 'll->l': numbers.int_shr_impl, - 'LL->L': numbers.int_shr_impl, - 'qq->q': numbers.int_shr_impl, - 'QQ->Q': numbers.int_shr_impl, + "bb->b": numbers.int_shr_impl, + "BB->B": numbers.int_shr_impl, + "hh->h": numbers.int_shr_impl, + "HH->H": numbers.int_shr_impl, + "ii->i": numbers.int_shr_impl, + "II->I": numbers.int_shr_impl, + "ll->l": numbers.int_shr_impl, + "LL->L": numbers.int_shr_impl, + "qq->q": numbers.int_shr_impl, + "QQ->Q": numbers.int_shr_impl, } db[np.log] = { - 'f->f': np_real_log_impl, - 'd->d': np_real_log_impl, - 'F->F': npyfuncs.np_complex_log_impl, - 'D->D': npyfuncs.np_complex_log_impl, + "f->f": np_real_log_impl, + "d->d": np_real_log_impl, + "F->F": npyfuncs.np_complex_log_impl, + "D->D": npyfuncs.np_complex_log_impl, } db[np.log2] = { - 'f->f': np_real_log2_impl, - 'd->d': np_real_log2_impl, - 'F->F': npyfuncs.np_complex_log2_impl, - 'D->D': npyfuncs.np_complex_log2_impl, + "f->f": np_real_log2_impl, + "d->d": np_real_log2_impl, + "F->F": npyfuncs.np_complex_log2_impl, + "D->D": npyfuncs.np_complex_log2_impl, } db[np.log10] = { - 'f->f': np_real_log10_impl, - 'd->d': np_real_log10_impl, 
- 'F->F': npyfuncs.np_complex_log10_impl, - 'D->D': npyfuncs.np_complex_log10_impl, + "f->f": np_real_log10_impl, + "d->d": np_real_log10_impl, + "F->F": npyfuncs.np_complex_log10_impl, + "D->D": npyfuncs.np_complex_log10_impl, } return db diff --git a/numba_cuda/numba/cuda/utils.py b/numba_cuda/numba/cuda/utils.py index 48ce2b011..a66989135 100644 --- a/numba_cuda/numba/cuda/utils.py +++ b/numba_cuda/numba/cuda/utils.py @@ -9,7 +9,7 @@ def _readenv(name, ctor, default): return default() if callable(default) else default try: if ctor is bool: - return value.lower() in {'1', "true"} + return value.lower() in {"1", "true"} return ctor(value) except Exception: warnings.warn( @@ -17,6 +17,6 @@ def _readenv(name, ctor, default): f"value '{value}' could not be parsed.\n" "The parse failed with exception:\n" f"{traceback.format_exc()}", - RuntimeWarning + RuntimeWarning, ) return default diff --git a/numba_cuda/numba/cuda/vector_types.py b/numba_cuda/numba/cuda/vector_types.py index 5174e2b20..147c21aee 100644 --- a/numba_cuda/numba/cuda/vector_types.py +++ b/numba_cuda/numba/cuda/vector_types.py @@ -50,7 +50,7 @@ def make_vector_type( name: str, base_type: types.Type, attr_names: Tuple[str, ...], - user_facing_object + user_facing_object, ) -> types.Type: """Create a vector type. 
@@ -149,7 +149,7 @@ def lowering(context, builder, sig, actual_args): lower(ctor, *arglist)(lowering) -vector_types : Dict[str, VectorType] = {} +vector_types: Dict[str, VectorType] = {} def build_constructor_overloads(base_type, vty_name, num_elements, arglists, l): diff --git a/numba_cuda/numba/cuda/vectorizers.py b/numba_cuda/numba/cuda/vectorizers.py index b4c6bcf5d..4cd80edbf 100644 --- a/numba_cuda/numba/cuda/vectorizers.py +++ b/numba_cuda/numba/cuda/vectorizers.py @@ -1,8 +1,11 @@ from numba import cuda from numpy import array as np_array from numba.cuda import deviceufunc -from numba.cuda.deviceufunc import (UFuncMechanism, GeneralizedUFunc, - GUFuncCallSteps) +from numba.cuda.deviceufunc import ( + UFuncMechanism, + GeneralizedUFunc, + GUFuncCallSteps, +) class CUDAUFuncDispatcher(object): @@ -28,8 +31,9 @@ def __call__(self, *args, **kws): return CUDAUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): - assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ - "ufunc" + assert len(list(self.functions.keys())[0]) == 2, ( + "must be a binary ufunc" + ) assert arg.ndim == 1, "must use 1d array" n = arg.shape[0] @@ -82,12 +86,12 @@ def __reduce(self, mem, gpu_mems, stream): class _CUDAGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ - '_stream', + "_stream", ] def __init__(self, nin, nout, args, kwargs): super().__init__(nin, nout, args, kwargs) - self._stream = kwargs.get('stream', 0) + self._stream = kwargs.get("stream", 0) def is_device_array(self, obj): return cuda.is_cuda_array(obj) @@ -126,25 +130,27 @@ def _call_steps(self): return _CUDAGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): - return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape, - strides=(0,), - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=shape, strides=(0,), dtype=ary.dtype, gpu_data=ary.gpu_data + ) def _broadcast_add_axis(self, ary, newshape): newax = len(newshape) - 
len(ary.shape) # Add 0 strides for missing dimension newstrides = (0,) * newax + ary.strides - return cuda.cudadrv.devicearray.DeviceNDArray(shape=newshape, - strides=newstrides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=newshape, + strides=newstrides, + dtype=ary.dtype, + gpu_data=ary.gpu_data, + ) class CUDAUFuncMechanism(UFuncMechanism): """ Provide CUDA specialization """ + DEFAULT_STREAM = 0 def launch(self, func, count, stream, args): @@ -173,9 +179,11 @@ def allocate_device_array(self, shape, dtype, stream): return cuda.device_array(shape=shape, dtype=dtype, stream=stream) def broadcast_device(self, ary, shape): - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -183,18 +191,17 @@ def broadcast_device(self, ary, shape): for ax in ax_differs: strides[ax] = 0 - return cuda.cudadrv.devicearray.DeviceNDArray(shape=shape, - strides=strides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return cuda.cudadrv.devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) -vectorizer_stager_source = ''' +vectorizer_stager_source = """ def __vectorized_{name}({args}, __out__): __tid__ = __cuda__.grid(1) if __tid__ < __out__.shape[0]: __out__[__tid__] = __core__({argitems}) -''' +""" class CUDAVectorize(deviceufunc.DeviceVectorize): @@ -204,8 +211,7 @@ def _compile_core(self, sig): def _get_globals(self, corefn): glbl = self.pyfunc.__globals__.copy() - glbl.update({'__cuda__': cuda, - '__core__': corefn}) + glbl.update({"__cuda__": cuda, "__core__": corefn}) return glbl def _compile_kernel(self, fnobj, sig): @@ -222,20 +228,20 @@ def _kernel_template(self): # 
------------------------------------------------------------------------------ # Generalized CUDA ufuncs -_gufunc_stager_source = ''' +_gufunc_stager_source = """ def __gufunc_{name}({args}): __tid__ = __cuda__.grid(1) if __tid__ < {checkedarg}: __core__({argitems}) -''' +""" class CUDAGUFuncVectorize(deviceufunc.DeviceGUFuncVectorize): def build_ufunc(self): engine = deviceufunc.GUFuncEngine(self.inputsig, self.outputsig) - return CUDAGeneralizedUFunc(kernelmap=self.kernelmap, - engine=engine, - pyfunc=self.pyfunc) + return CUDAGeneralizedUFunc( + kernelmap=self.kernelmap, engine=engine, pyfunc=self.pyfunc + ) def _compile_kernel(self, fnobj, sig): return cuda.jit(sig)(fnobj) @@ -247,6 +253,5 @@ def _kernel_template(self): def _get_globals(self, sig): corefn = cuda.jit(sig, device=True)(self.pyfunc) glbls = self.py_func.__globals__.copy() - glbls.update({'__cuda__': cuda, - '__core__': corefn}) + glbls.update({"__cuda__": cuda, "__core__": corefn}) return glbls diff --git a/pyproject.toml b/pyproject.toml index 2a484d9da..6dbf04e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,3 +37,66 @@ include = ["numba_cuda*"] [tool.setuptools.package-data] "*" = ["*.cu", "*.h", "*.hpp", "*.ptx", "*.cuh", "VERSION", "Makefile"] + +[tool.ruff] +line-length = 80 + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 80 + +[tool.ruff.lint.pycodestyle] +max-doc-length = 80 +max-line-length = 80 + +[tool.ruff.lint] +ignore = [ + # Extra space in brackets + "E20", + # Multiple spaces around "," + "E231", + "E241", + # Comments + "E26", + # Assigning lambda expression + "E731", + # Ambiguous variable names + "E741", +] +fixable = ["ALL"] + +exclude = [ + "__pycache__", + ".git", + "*.pyc", + "*~", + "*.o", + "*.so", + "*.cpp", + "*.c", + "*.h", +] + +[tool.ruff.lint.per-file-ignores] +# Slightly long line in the standard version file +"numba_cuda/_version.py" = ["E501"] +# "Unused" imports / potentially undefined names in init files 
+"numba_cuda/numba/cuda/__init__.py" = ["F401", "F403", "F405"] +"numba_cuda/numba/cuda/simulator/__init__.py" = ["F401", "F403"] +"numba_cuda/numba/cuda/simulator/cudadrv/__init__.py" = ["F401"] +# Ignore star imports, unused imports, and "may be defined by star imports" +# errors in device_init because its purpose is to bring together a lot of +# the public API to be star-imported in numba.cuda.__init__ +"numba_cuda/numba/cuda/device_init.py" = ["F401", "F403", "F405"] +# libdevice.py is an autogenerated file containing stubs for all the device +# functions. Some of the lines in docstrings are a little over-long, as they +# contain the URLs of the reference pages in the online libdevice +# documentation. +"numba_cuda/numba/cuda/libdevice.py" = ["E501"] +# Ignore too-long lines in the doc examples, prioritising readability +# in the docs over line length in the example source (especially given that +# the test code is already indented by 8 spaces) +"numba_cuda/numba/cuda/tests/doc_examples/test_random.py" = ["E501"] +"numba_cuda/numba/cuda/tests/doc_examples/test_cg.py" = ["E501"] +"numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py" = ["E501"] +"numba_cuda/numba/tests/doc_examples/test_interval_example.py" = ["E501"] diff --git a/setup.py b/setup.py index 98a1061d2..bfb11f27a 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,12 @@ def run(self): def get_source_files(self): src = super().get_source_files() - src.extend([ - str(SITE_PACKAGES / REDIRECTOR_PTH), - str(SITE_PACKAGES / REDIRECTOR_PY), - ]) + src.extend( + [ + str(SITE_PACKAGES / REDIRECTOR_PTH), + str(SITE_PACKAGES / REDIRECTOR_PY), + ] + ) return src def get_output_mapping(self): @@ -60,11 +62,17 @@ def _select_strategy(self, name, tag, build_lib): # the repo. It could be implemented, but we only handle the default # case for now. if self.mode is not None and self.mode != "lenient": - raise RuntimeError("Only lenient mode is supported for editable " - f"install. 
Current mode is {self.mode}") + raise RuntimeError( + "Only lenient mode is supported for editable " + f"install. Current mode is {self.mode}" + ) return TopLevelFinderWithRedirector(self.distribution, name) -setup(cmdclass={"build_py": build_py_with_redirector, - "editable_wheel": editable_wheel_with_redirector}) +setup( + cmdclass={ + "build_py": build_py_with_redirector, + "editable_wheel": editable_wheel_with_redirector, + } +) diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py index ae9043307..1c76609ac 100644 --- a/site-packages/_numba_cuda_redirector.py +++ b/site-packages/_numba_cuda_redirector.py @@ -4,11 +4,14 @@ import sys import warnings -multiple_locations_msg = ("Multiple submodule search locations for {}. " - "Cannot redirect numba.cuda to numba_cuda") +multiple_locations_msg = ( + "Multiple submodule search locations for {}. " + "Cannot redirect numba.cuda to numba_cuda" +) -no_spec_msg = ("Couldn't get spec for {}. " - "Cannot redirect numba.cuda to numba_cuda") +no_spec_msg = ( + "Couldn't get spec for {}. 
Cannot redirect numba.cuda to numba_cuda" +) class NumbaCudaFinder(importlib.abc.MetaPathFinder): @@ -19,17 +22,17 @@ def ensure_initialized(self): if self.initialized is not None: return self.initialized - numba_spec = importlib.util.find_spec('numba') + numba_spec = importlib.util.find_spec("numba") if numba_spec is None: - warnings.warn(no_spec_msg.format('numba')) + warnings.warn(no_spec_msg.format("numba")) self.initialized = False return False - numba_cuda_spec = importlib.util.find_spec('numba_cuda') + numba_cuda_spec = importlib.util.find_spec("numba_cuda") if numba_spec is None: - warnings.warn(no_spec_msg.format('numba_cuda')) + warnings.warn(no_spec_msg.format("numba_cuda")) self.initialized = False return False @@ -37,19 +40,19 @@ def ensure_initialized(self): numba_cuda_search_locations = numba_cuda_spec.submodule_search_locations if len(numba_search_locations) != 1: - warnings.warn(multiple_locations_msg.format('numba')) + warnings.warn(multiple_locations_msg.format("numba")) self.initialized = False return False if len(numba_cuda_search_locations) != 1: - warnings.warn(multiple_locations_msg.format('numba_cuda')) + warnings.warn(multiple_locations_msg.format("numba_cuda")) self.initialized = False return False self.numba_path = numba_search_locations[0] location = numba_cuda_search_locations[0] - self.numba_cuda_path = str((pathlib.Path(location) / 'numba')) + self.numba_cuda_path = str((pathlib.Path(location) / "numba")) self.initialized = True return True @@ -64,8 +67,9 @@ def find_spec(self, name, path, target=None): # Re-entrancy - return and carry on return None - oot_path = [p.replace(self.numba_path, self.numba_cuda_path) - for p in path] + oot_path = [ + p.replace(self.numba_path, self.numba_cuda_path) for p in path + ] for finder in sys.meta_path: try: spec = finder.find_spec(name, oot_path, target)