diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index 6c76358d5576e..72332dd65113c 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -125,5 +125,6 @@ _PI_API(piextUSMEnqueueMemAdvise) _PI_API(piextUSMGetMemAllocInfo) _PI_API(piextKernelSetArgMemObj) +_PI_API(piextKernelSetArgSampler) #undef _PI_API diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 5b7eb31b9297c..02faf9a8fb99a 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1332,10 +1332,22 @@ __SYCL_EXPORT pi_result piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, const pi_event *event_wait_list, pi_event *event); +// Extension to allow backends to process a PI memory object before adding it +// as an argument for a kernel. +// Note: This is needed by the CUDA backend to extract the device pointer to +// the memory as the kernels uses it rather than the PI object itself. __SYCL_EXPORT pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value); +// Extension to allow backends to process a PI sampler object before adding it +// as an argument for a kernel. +// Note: This is needed by the CUDA backend to extract the properties of the +// sampler as the kernels uses it rather than the PI object itself. +__SYCL_EXPORT pi_result piextKernelSetArgSampler(pi_kernel kernel, + pi_uint32 arg_index, + const pi_sampler *arg_value); + /// // USM /// diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4fd8bcf75435f..4c8414bbb493c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2210,6 +2210,22 @@ pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, return retErr; } +pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, + const pi_sampler *arg_value) { + + assert(kernel != nullptr); + assert(arg_value != nullptr); + + pi_result retErr = PI_SUCCESS; + try { + pi_uint32 samplerProps = (*arg_value)->props_; + kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + pi_result cuda_piEnqueueKernelLaunch( pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -2989,32 +3005,139 @@ pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, return {}; } -/// \TODO Not implemented in CUDA, need untie from OpenCL +/// Creates a PI sampler object +/// +/// \param[in] context The context the sampler is created for. +/// \param[in] sampler_properties The properties for the sampler. +/// \param[out] result_sampler Set to the resulting sampler object. +/// +/// \return PI_SUCCESS on success. PI_INVALID_VALUE if given an invalid property +/// or if there is multiple of properties from the same category. pi_result cuda_piSamplerCreate(pi_context context, - const cl_sampler_properties *sampler_properties, + const pi_sampler_properties *sampler_properties, pi_sampler *result_sampler) { - cl::sycl::detail::pi::die("cuda_piSamplerCreate not implemented"); - return {}; + std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; + + bool propSeen[3] = {false, false, false}; + for (size_t i = 0; sampler_properties[i] != 0; i += 2) { + switch (sampler_properties[i]) { + case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: + if (propSeen[0]) { + return PI_INVALID_VALUE; + } + propSeen[0] = true; + retImplSampl->props_ |= sampler_properties[i + 1]; + break; + case PI_SAMPLER_PROPERTIES_FILTER_MODE: + if (propSeen[1]) { + return PI_INVALID_VALUE; + } + propSeen[1] = true; + retImplSampl->props_ |= + (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; + break; + case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: + if (propSeen[2]) { + return PI_INVALID_VALUE; + } + propSeen[2] = true; + retImplSampl->props_ |= + (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; + break; + default: + return PI_INVALID_VALUE; + } + } + + if (!propSeen[0]) { + retImplSampl->props_ |= CL_TRUE; + } + // Default filter mode to CL_FILTER_NEAREST + if (!propSeen[2]) { + retImplSampl->props_ |= (CL_ADDRESS_CLAMP % CL_ADDRESS_NONE) << 2; + } + + *result_sampler = retImplSampl.release(); + return PI_SUCCESS; } -/// \TODO Not implemented in CUDA, need untie from OpenCL +/// Gets information from a PI sampler object +/// +/// \param[in] sampler The sampler to get the information from. +/// \param[in] param_name The name of the information to get. +/// \param[in] param_value_size The size of the param_value. +/// \param[out] param_value Set to information value. +/// \param[out] param_value_size_ret Set to the size of the information value. +/// +/// \return PI_SUCCESS on success. pi_result cuda_piSamplerGetInfo(pi_sampler sampler, cl_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { - cl::sycl::detail::pi::die("cuda_piSamplerGetInfo not implemented"); + assert(sampler != nullptr); + + switch (param_name) { + case PI_SAMPLER_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + sampler->get_reference_count()); + case PI_SAMPLER_INFO_CONTEXT: + return getInfo(param_value_size, param_value, param_value_size_ret, + sampler->context_); + case PI_SAMPLER_INFO_NORMALIZED_COORDS: { + pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); + return getInfo(param_value_size, param_value, param_value_size_ret, + norm_coords_prop); + } + case PI_SAMPLER_INFO_FILTER_MODE: { + pi_sampler_filter_mode filter_prop = static_cast( + ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); + return getInfo(param_value_size, param_value, param_value_size_ret, + filter_prop); + } + case PI_SAMPLER_INFO_ADDRESSING_MODE: { + pi_sampler_addressing_mode addressing_prop = + static_cast( + (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); + return getInfo(param_value_size, param_value, param_value_size_ret, + addressing_prop); + } + default: + PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } return {}; } -/// \TODO Not implemented in CUDA, need untie from OpenCL +/// Retains a PI sampler object, incrementing its reference count. +/// +/// \param[in] sampler The sampler to increment the reference count of. +/// +/// \return PI_SUCCESS. pi_result cuda_piSamplerRetain(pi_sampler sampler) { - cl::sycl::detail::pi::die("cuda_piSamplerRetain not implemented"); - return {}; + assert(sampler != nullptr); + sampler->increment_reference_count(); + return PI_SUCCESS; } -/// \TODO Not implemented in CUDA, need untie from OpenCL +/// Releases a PI sampler object, decrementing its reference count. If the +/// reference count reaches zero, the sampler object is destroyed. +/// +/// \param[in] sampler The sampler to decrement the reference count of. +/// +/// \return PI_SUCCESS. pi_result cuda_piSamplerRelease(pi_sampler sampler) { - cl::sycl::detail::pi::die("cuda_piSamplerRelease not implemented"); - return {}; + assert(sampler != nullptr); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + cl::sycl::detail::pi::assertion( + sampler->get_reference_count() != 0, + "Reference count overflow detected in cuda_piSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (sampler->decrement_reference_count() == 0) { + delete sampler; + } + + return PI_SUCCESS; } /// General 3D memory copy operation. @@ -3934,6 +4057,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo) _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) #undef _PI_CL diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 333264ad6b011..229997c4af385 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -617,6 +617,26 @@ struct _pi_kernel { void clear_local_size() { args_.clear_local_size(); } }; +/// Implementation of samplers for CUDA +/// +/// Sampler property layout: +/// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct _pi_sampler { + std::atomic_uint32_t refCount_; + pi_uint32 props_; + pi_context context_; + + _pi_sampler(pi_context context) + : refCount_(1), props_(0), context_(context) {} + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } +}; + // ------------------------------------------------------------- // Helper types and functions // diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 87479d8b86cab..dc414d532b2a5 100644 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -2003,7 +2003,7 @@ pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, return PI_SUCCESS; } -// Special version of piKernelSetArg to accept pi_mem and pi_sampler. +// Special version of piKernelSetArg to accept pi_mem. pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, const pi_mem *ArgValue) { // TODO: the better way would probably be to add a new PI API for @@ -2020,6 +2020,13 @@ pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, return PI_SUCCESS; } +// Special version of piKernelSetArg to accept pi_sampler. +pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_sampler *ArgValue) { + die("piextKernelSetArgSampler: not implemented"); + return {}; +} + pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 718132f9a1180..7287dd1c0bf89 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -449,6 +449,13 @@ pi_result piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, sizeof(arg_value), cast(arg_value))); } +pi_result piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, + const pi_sampler *arg_value) { + return cast( + clSetKernelArg(cast(kernel), cast(arg_index), + sizeof(cl_sampler), cast(arg_value))); +} + pi_result piextGetDeviceFunctionPointer(pi_device device, pi_program program, const char *func_name, pi_uint64 *function_pointer_ret) { @@ -1234,6 +1241,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextUSMGetMemAllocInfo, piextUSMGetMemAllocInfo) _PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgSampler, piextKernelSetArgSampler) #undef _PI_CL diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index e836336b98449..30bf505693098 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1657,8 +1657,8 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( sampler *SamplerPtr = (sampler *)Arg.MPtr; RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) ->getOrCreateSampler(MQueue->get_context()); - Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_sampler), &Sampler); + Plugin.call(Kernel, Arg.MIndex, + &Sampler); break; } case kernel_param_kind_t::kind_pointer: { diff --git a/sycl/test/abi/pi_level0_symbol_check.dump b/sycl/test/abi/pi_level0_symbol_check.dump index 68b2d0c0e0c6b..8967179926f78 100644 --- a/sycl/test/abi/pi_level0_symbol_check.dump +++ b/sycl/test/abi/pi_level0_symbol_check.dump @@ -77,6 +77,7 @@ piProgramGetInfo piextGetDeviceFunctionPointer piEnqueueMemUnmap piextKernelSetArgMemObj +piextKernelSetArgSampler piQueueCreate piEventCreate piKernelGetInfo diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 529b64f006b6b..b9bff16ab546a 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -28,6 +28,7 @@ piextEventCreateWithNativeHandle piextGetDeviceFunctionPointer piextProgramGetNativeHandle piextKernelSetArgMemObj +piextKernelSetArgSampler piextKernelSetArgPointer piextMemCreateWithNativeHandle piextMemGetNativeHandle