From 96d8067a7ebfa8fca1af7bb2cad1d60b77b4ae7d Mon Sep 17 00:00:00 2001
From: Florian Reibold
Date: Fri, 13 Sep 2024 10:30:17 +0200
Subject: [PATCH] [WIP] BVH buffer in explicitly managed host/device memory on
 systems without unified memory (e.g. dGPUs)

---
 kernels/common/scene.cpp               |  23 ++-
 kernels/common/scene.h                 |   9 +-
 kernels/level_zero/ze_wrapper.cpp      |  20 +++
 kernels/level_zero/ze_wrapper.h        |   2 +
 kernels/rthwif/testing/CMakeLists.txt  |   4 +-
 kernels/sycl/rthwif_embree.cpp         |  16 +-
 kernels/sycl/rthwif_embree_builder.cpp | 106 ++++++++++--
 kernels/sycl/rthwif_embree_builder.h   | 152 ++++++++++++------
 kernels/sycl/scene_sycl.cpp            |   7 +-
 .../triangle_geometry_device.cpp       |  31 ++++
 10 files changed, 286 insertions(+), 84 deletions(-)

diff --git a/kernels/common/scene.cpp b/kernels/common/scene.cpp
index d40c701b83..3fe6fa7ccc 100644
--- a/kernels/common/scene.cpp
+++ b/kernels/common/scene.cpp
@@ -56,8 +56,8 @@ namespace embree
 
     /* use proper device and context for SYCL allocations */
 #if defined(EMBREE_SYCL_SUPPORT)
-    if (DeviceGPU* gpu_device = dynamic_cast<DeviceGPU*>(device))
-      hwaccel = AccelBuffer(AccelAllocator<char>(device,gpu_device->getGPUDevice(),gpu_device->getGPUContext()),0);
+    if (dynamic_cast<DeviceGPU*>(device))
+      accelBuffer = AccelBuffer(device);
 #endif
 
     /* one can overwrite flags through device for debugging */
@@ -789,10 +789,8 @@ namespace embree
   void Scene::build_gpu_accels()
   {
 #if defined(EMBREE_SYCL_SUPPORT)
-    auto [aabb, stride] = rthwifBuild(this,hwaccel);
-    hwaccel_stride = stride;
-    bounds = LBBox(aabb);
-    hwaccel_bounds = aabb;
+    accelBuffer.build(this);
+    bounds = LBBox(accelBuffer.getBounds());
 #endif
   }
 
@@ -917,6 +915,10 @@ namespace embree
       taskGroup->scheduler = nullptr;
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
   }
 #endif
 
@@ -982,6 +984,10 @@ namespace embree
       accels_clear();
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
   }
 #endif
 
@@ -1024,6 +1030,11 @@ namespace embree
       accels_clear();
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
+
   }
 #endif
 
diff --git a/kernels/common/scene.h b/kernels/common/scene.h
index f996154945..4bb12b14df 100644
--- a/kernels/common/scene.h
+++ b/kernels/common/scene.h
@@ -296,14 +296,7 @@ namespace embree
 
 #if defined(EMBREE_SYCL_SUPPORT)
   public:
-    BBox3f hwaccel_bounds = empty;
-    AccelBuffer hwaccel;   // the buffer containing the HW acceleration structures corresponding to the scene. One for each time segment, stored in a contiguous chunk of memory.
-    size_t hwaccel_stride; // the stride between two HW acceleration structures for different time segments stored in hwaccel.
-
-    __forceinline char* getHWAccel(uint32_t time_segment) const {
-      char* ptr = (char*)hwaccel.data() + time_segment * hwaccel_stride;
-      return ptr;
-    }
+    AccelBuffer accelBuffer;
 #endif
 
   private:
 
diff --git a/kernels/level_zero/ze_wrapper.cpp b/kernels/level_zero/ze_wrapper.cpp
index 50af2fcf31..d68afbf601 100644
--- a/kernels/level_zero/ze_wrapper.cpp
+++ b/kernels/level_zero/ze_wrapper.cpp
@@ -30,6 +30,8 @@ static std::mutex zeWrapperMutex;
 static void* handle = nullptr;
 
 static decltype(zeMemFree)* zeMemFreeInternal = nullptr;
+static decltype(zeMemAllocHost)* zeMemAllocHostInternal = nullptr;
+static decltype(zeMemAllocDevice)* zeMemAllocDeviceInternal = nullptr;
 static decltype(zeMemAllocShared)* zeMemAllocSharedInternal = nullptr;
 static decltype(zeDriverGetExtensionProperties)* zeDriverGetExtensionPropertiesInternal = nullptr;
 static decltype(zeDeviceGetProperties)* zeDeviceGetPropertiesInternal = nullptr;
@@ -156,6 +158,8 @@ ze_result_t ZeWrapper::init()
     handle = load_module();
 
     zeMemFreeInternal = find_symbol(handle, "zeMemFree");
+    zeMemAllocHostInternal = find_symbol(handle, "zeMemAllocHost");
+    zeMemAllocDeviceInternal = find_symbol(handle, "zeMemAllocDevice");
     zeMemAllocSharedInternal = find_symbol(handle, "zeMemAllocShared");
     zeDriverGetExtensionPropertiesInternal = find_symbol(handle, "zeDriverGetExtensionProperties");
     zeDeviceGetPropertiesInternal = find_symbol(handle, "zeDeviceGetProperties");
@@ -218,6 +222,22 @@ ze_result_t ZeWrapper::zeMemFree(ze_context_handle_t context, void* ptr)
   return zeMemFreeInternal(context, ptr);
 }
 
+ze_result_t ZeWrapper::zeMemAllocHost(ze_context_handle_t context, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, void** ptr)
+{
+  if (!handle || !zeMemAllocHostInternal)
+    throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");
+
+  return zeMemAllocHostInternal(context, desch, s0, s1, ptr);
+}
+
+ze_result_t ZeWrapper::zeMemAllocDevice(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
+{
+  if (!handle || !zeMemAllocDeviceInternal)
+    throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");
+
+  return zeMemAllocDeviceInternal(context, descd, s0, s1, ze_handle, ptr);
+}
+
 ze_result_t ZeWrapper::zeMemAllocShared(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
 {
   if (!handle || !zeMemAllocSharedInternal)
 
diff --git a/kernels/level_zero/ze_wrapper.h b/kernels/level_zero/ze_wrapper.h
index 3a2ca389a4..f4f409ed5a 100644
--- a/kernels/level_zero/ze_wrapper.h
+++ b/kernels/level_zero/ze_wrapper.h
@@ -39,6 +39,8 @@ struct ZeWrapper
   static ze_result_t initRTASBuilder(ze_driver_handle_t hDriver, RTAS_BUILD_MODE rtas_build_mode = RTAS_BUILD_MODE::AUTO);
 
   static ze_result_t zeMemFree(ze_context_handle_t, void*);
+  static ze_result_t zeMemAllocHost(ze_context_handle_t, const ze_host_mem_alloc_desc_t*, size_t, size_t, void**);
+  static ze_result_t zeMemAllocDevice(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
   static ze_result_t zeMemAllocShared(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, const ze_host_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
   static ze_result_t zeDriverGetExtensionProperties(ze_driver_handle_t, uint32_t*, ze_driver_extension_properties_t*);
   static ze_result_t zeDeviceGetProperties(ze_device_handle_t, ze_device_properties_t*);
diff --git a/kernels/rthwif/testing/CMakeLists.txt b/kernels/rthwif/testing/CMakeLists.txt
index 4c3f22e0bf..efaad9f6d4 100644
--- a/kernels/rthwif/testing/CMakeLists.txt
+++ b/kernels/rthwif/testing/CMakeLists.txt
@@ -31,14 +31,14 @@ IF (EMBREE_SYCL_RT_SIMULATION)
 ENDIF()
 
 ADD_EXECUTABLE(embree_rthwif_cornell_box rthwif_cornell_box.cpp)
-TARGET_LINK_LIBRARIES(embree_rthwif_cornell_box sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+TARGET_LINK_LIBRARIES(embree_rthwif_cornell_box sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper ${EMBREE_RTHWIF_SYCL})
 SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
 SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
 INSTALL(TARGETS embree_rthwif_cornell_box DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
 SIGN_TARGET(embree_rthwif_cornell_box)
 
 ADD_EXECUTABLE(embree_rthwif_test rthwif_test.cpp)
-TARGET_LINK_LIBRARIES(embree_rthwif_test sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+TARGET_LINK_LIBRARIES(embree_rthwif_test sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper ${EMBREE_RTHWIF_SYCL})
 SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
 SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
 INSTALL(TARGETS embree_rthwif_test DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
 
diff --git a/kernels/sycl/rthwif_embree.cpp b/kernels/sycl/rthwif_embree.cpp
index df00b40a36..98c66d2cd2 100644
--- a/kernels/sycl/rthwif_embree.cpp
+++ b/kernels/sycl/rthwif_embree.cpp
@@ -110,7 +110,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, RayHit& ray
     raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
 #endif
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);
   intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
   return false;
@@ -152,7 +152,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, Ray& ray, U
     raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
 #endif
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);
   intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
   return false;
@@ -208,7 +208,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, RayHit& ray, Ins
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -261,7 +261,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, Ray& ray, Instan
      bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -319,7 +319,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, RayHit& ra
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -374,7 +374,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, Ray& ray,
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -752,7 +752,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcIntersectRTHW(sycl::global_
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);
   intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
   intel_ray_query_start_traversal(query);
 
@@ -854,7 +854,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcOccludedRTHW(sycl::global_p
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);
   intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
   intel_ray_query_start_traversal(query);
 
diff --git a/kernels/sycl/rthwif_embree_builder.cpp b/kernels/sycl/rthwif_embree_builder.cpp
index 22ebd96bf1..662547ae39 100644
--- a/kernels/sycl/rthwif_embree_builder.cpp
+++ b/kernels/sycl/rthwif_embree_builder.cpp
@@ -156,7 +156,7 @@ namespace embree
     return sycl_device.get_info();
   }
 
-  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context)
+  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context, sycl::usm::alloc alloc_type)
   {
     ze_context_handle_t hContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(context);
     ze_device_handle_t hDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device);
@@ -176,28 +176,44 @@ namespace embree
     relaxed.stype = ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC;
     relaxed.pNext = &rt_desc;
     relaxed.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
-    
+
     ze_device_mem_alloc_desc_t device_desc;
     device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
     device_desc.pNext = &relaxed;
     device_desc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
     device_desc.ordinal = 0;
-    
+
     ze_host_mem_alloc_desc_t host_desc;
     host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
     host_desc.pNext = nullptr;
     host_desc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
-    
+
     void* ptr = nullptr;
+    // TODO: modify memory monitor to account for host and device memory separately
     if (embree_device) embree_device->memoryMonitor(bytes,false);
-    ze_result_t result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+
+    ze_result_t result;
+    switch (alloc_type) {
+      case sycl::usm::alloc::host:
+        result = ZeWrapper::zeMemAllocHost(hContext,&host_desc,bytes,rtasProp.rtasBufferAlignment,&ptr);
+        break;
+      case sycl::usm::alloc::device:
+        result = ZeWrapper::zeMemAllocDevice(hContext,&device_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+        break;
+      case sycl::usm::alloc::shared:
+        result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+        break;
+      default:
+        throw_RTCError(RTC_ERROR_UNKNOWN, "sycl::usm::alloc type unknown in rthwifAllocAccelBuffer");
+    }
+
     if (result != ZE_RESULT_SUCCESS) throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"rtas memory allocation failed");
 
     return ptr;
   }
-  
+
   void rthwifFreeAccelBuffer(Device* embree_device, void* ptr, size_t bytes, sycl::context context)
   {
     if (ptr == nullptr) return;
@@ -362,9 +378,9 @@ namespace embree
     const AffineSpace3fa local2world = geom->getLocal2World();
     out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
     out->pTransform = (float*) &out->xfmdata;
-    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
+    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
     out->xfmdata = *(ze_rtas_transform_float3x4_aligned_column_major_exp_t*) &local2world;
-    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
+    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
   }
 
   void createGeometryDesc(ze_rtas_builder_instance_geometry_info_exp_t* out, Scene* scene, Instance* geom)
@@ -377,8 +393,8 @@ namespace embree
     out->instanceUserID = 0;
     out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
     out->pTransform = (float*) &geom->local2world[0];
-    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
-    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
+    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
+    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
   }
 
   void createGeometryDesc(char* out, Scene* scene, Geometry* geom, GEOMETRY_TYPE type)
@@ -417,7 +433,7 @@ namespace embree
     return result;
   }
 
-  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBuffer& accel)
+  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBufferData& accel)
   {
     DeviceGPU* gpuDevice = dynamic_cast<DeviceGPU*>(scene->device);
     if (gpuDevice == nullptr) throw std::runtime_error("internal error");
@@ -647,4 +663,72 @@ namespace embree
 
     return std::tie(fullBounds, sizeTotal.rtasBufferSizeBytesExpected);
   }
+
+  AccelBuffer::AccelBuffer(Device *device) : device(device)
+  {
+    DeviceGPU *gpu_device = dynamic_cast<DeviceGPU*>(device);
+
+    if (!gpu_device)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "AccelBuffer constructor called with non-GPU device");
+    }
+
+    unifiedMemory = false; //gpu_device->has_unified_memory();
+
+    if (unifiedMemory)
+    {
+      accelBufferShared = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::shared), 0);
+    }
+    else
+    {
+      accelBufferHost = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::host), 0);
+      accelBufferDevice = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::device), 0);
+    }
+  }
+
+  void AccelBuffer::build(Scene *scene)
+  {
+    auto [aabb, stride] = rthwifBuild(scene, getAccelBufferData());
+    hwaccel_stride = stride;
+    hwaccel_bounds = aabb;
+  }
+
+  void AccelBuffer::commit()
+  {
+    if (unifiedMemory) {
+      hwaccel = (char*)accelBufferShared.data();
+      return;
+    }
+
+    auto deviceGPU = dynamic_cast<DeviceGPU*>(device);
+    if (!deviceGPU) {
+      return;
+    }
+
+    std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
+    std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;
+
+    accelBufferDevice.resize(accelBufferHost.size());
+
+    sycl::queue queue(deviceGPU->getGPUDevice());
+    queue.memcpy(accelBufferDevice.data(), accelBufferHost.data(), accelBufferHost.size());
+    queue.wait_and_throw();
+
+    std::vector<char> host_data(accelBufferHost.size());
+    queue.memcpy(host_data.data(), accelBufferDevice.data(), accelBufferDevice.size());
+    queue.wait_and_throw();
+
+    for (size_t i = 0; i < accelBufferHost.size(); ++i) {
+      if (accelBufferHost[i] != host_data[i]) {
+        std::cout << (int)accelBufferHost[i] << " - " << (int)host_data[i] << std::endl;
+      }
+    }
+
+    std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
+    std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;
+
+    hwaccel = (char*)accelBufferDevice.data();
+    printf("hwaccel %p\n", hwaccel);
+  }
+
 }
 
diff --git a/kernels/sycl/rthwif_embree_builder.h b/kernels/sycl/rthwif_embree_builder.h
index d3f128f910..90b4d840d8 100644
--- a/kernels/sycl/rthwif_embree_builder.h
+++ b/kernels/sycl/rthwif_embree_builder.h
@@ -8,67 +8,125 @@
 #include "../../common/sys/vector.h"
 #include "../../common/math/bbox.h"
 #include "../../include/embree4/rtcore.h"
-  
+
 namespace embree
 {
   class Scene;
-
-  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context);
+
+  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context, sycl::usm::alloc alloc_type);
   void rthwifFreeAccelBuffer(Device* embree_device, void* ptr, size_t bytes, sycl::context context);
+  void* zeRTASInitExp(sycl::device device, sycl::context context);
+
+  void rthwifCleanup(Device* embree_device, void* dispatchGlobalsPtr, sycl::context context);
+
+  int rthwifIsSYCLDeviceSupported(const sycl::device& sycl_device);
+
   /*! allocator that performs BVH memory allocations */
-  template<typename T>
-  struct AccelAllocator
+  template<typename T>
+  struct AccelAllocator
+  {
+    typedef T value_type;
+    typedef T *pointer;
+    typedef const T *const_pointer;
+    typedef T &reference;
+    typedef const T &const_reference;
+    typedef std::size_t size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    AccelAllocator()
+      : device(nullptr), context(nullptr), alloc_type(sycl::usm::alloc::unknown) {}
+
+    AccelAllocator(Device *embree_device, const sycl::device &device, const sycl::context &context, sycl::usm::alloc alloc_type)
+      : embree_device(embree_device), device(&device), context(&context), alloc_type(alloc_type) {}
+
+    __forceinline pointer allocate(size_type n)
+    {
+      if (context && device)
+        return (pointer)rthwifAllocAccelBuffer(embree_device, n * sizeof(T), *device, *context, alloc_type);
+      else
+        return nullptr;
+    }
+
+    __forceinline void deallocate(pointer p, size_type n)
+    {
+      if (context)
+        rthwifFreeAccelBuffer(embree_device, p, n * sizeof(T), *context);
+    }
+
+    __forceinline void construct(pointer p, const_reference val)
+    {
+      new (p) T(val);
+    }
+
+    __forceinline void destroy(pointer p)
     {
-      typedef T value_type;
-      typedef T* pointer;
-      typedef const T* const_pointer;
-      typedef T& reference;
-      typedef const T& const_reference;
-      typedef std::size_t size_type;
-      typedef std::ptrdiff_t difference_type;
-
-      AccelAllocator()
-        : device(nullptr), context(nullptr) {}
-
-      AccelAllocator(Device* embree_device, const sycl::device& device, const sycl::context& context)
-        : embree_device(embree_device), device(&device), context(&context) {}
-
-      __forceinline pointer allocate( size_type n ) {
-        if (context && device)
-          return (pointer) rthwifAllocAccelBuffer(embree_device,n*sizeof(T),*device,*context);
+      p->~T();
+    }
+
+  private:
+    Device *embree_device;
+    const sycl::device *device;
+    const sycl::context *context;
+    sycl::usm::alloc alloc_type;
+  };
+
+  typedef vector_t<char, AccelAllocator<char>> AccelBufferData;
+
+  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBufferData& buffer_o);
+
+  // The buffers containing the HW acceleration structures corresponding to the scene. One for each time segment, stored in a contiguous chunk of memory.
+  // On devices with unified memory a single USM shared allocation is used, so the host and device views of the BVH point to the same memory.
+  // On devices without unified memory, a USM host allocation is used for BVH building and on scene commit the data is copied to a USM device allocation on the device.
+  struct AccelBuffer {
+    AccelBuffer() {}
+    AccelBuffer(Device *device);
+
+    __forceinline char* getHWAccel(uint32_t time_segment) const {
+#if defined(__SYCL_DEVICE_ONLY__)
+      return hwaccel + time_segment * hwaccel_stride;
+      //return (char*)accelBufferShared.data() + time_segment * hwaccel_stride;
+#else
+      if (unifiedMemory)
+        return (char*)accelBufferShared.data() + time_segment * hwaccel_stride;
       else
-        return nullptr;
+        return (char*)accelBufferHost.data() + time_segment * hwaccel_stride;
+#endif
     }
-
-      __forceinline void deallocate( pointer p, size_type n ) {
-        if (context)
-          rthwifFreeAccelBuffer(embree_device,p,n*sizeof(T),*context);
-      }
-
-      __forceinline void construct( pointer p, const_reference val ) {
-        new (p) T(val);
+
+    __forceinline size_t getHWAccelSize() const {
+      if (unifiedMemory)
+        return accelBufferShared.size();
+      else
+        return accelBufferHost.size();
     }
-
-      __forceinline void destroy( pointer p ) {
-        p->~T();
+
+#if !defined(__SYCL_DEVICE_ONLY__)
+
+    __forceinline AccelBufferData& getAccelBufferData() {
+      if (unifiedMemory)
+        return accelBufferShared;
+      else
+        return accelBufferHost;
     }
-  private:
+    inline BBox3f const& getBounds() { return hwaccel_bounds; }
 
-    Device* embree_device;
-    const sycl::device* device;
-    const sycl::context* context;
-  };
+    void build(Scene* scene);
+    void commit();
+#endif
 
-  typedef vector_t<char,AccelAllocator<char>> AccelBuffer;
-
-  void* zeRTASInitExp(sycl::device device, sycl::context context);
-
-  void rthwifCleanup(Device* embree_device, void* dispatchGlobalsPtr, sycl::context context);
+  private:
+    AccelBufferData accelBufferHost;   // only used on systems where unified memory is not available (dGPU)
+    AccelBufferData accelBufferDevice; // only used on systems where unified memory is not available (dGPU)
+    AccelBufferData accelBufferShared; // only used when the system has unified memory between CPU and GPU (iGPU)
+    char* hwaccel;                     // pointer to the accel buffer on the device, only valid after scene commit
+
+    BBox3f hwaccel_bounds = empty;
+    size_t hwaccel_stride; // the stride between two HW acceleration structures for different time segments stored in hwaccel.
+    bool unifiedMemory;
+    Device* device;
+  };
 
-  int rthwifIsSYCLDeviceSupported(const sycl::device& sycl_device);
-
-  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBuffer& buffer_o);
 }
diff --git a/kernels/sycl/scene_sycl.cpp b/kernels/sycl/scene_sycl.cpp
index 4e9b1fb304..386cbb538c 100644
--- a/kernels/sycl/scene_sycl.cpp
+++ b/kernels/sycl/scene_sycl.cpp
@@ -46,6 +46,9 @@ RTC_API_EXTERN_C bool prefetchUSMSharedOnGPU(RTCScene hscene)
   // we accumulate some nonsense data to prevent compiler
   // optimizing away the memory fetches in the GPU kernel
   size_t* result = sycl::malloc_shared<size_t>(num_workers, queue);
+
+  size_t accelSize = scene->accelBuffer.getHWAccelSize();
+  char* accelPtr = scene->accelBuffer.getHWAccel(0);
 
   // Use num_workers GPU work items to iterate over all USM shared
   // allocations to trigger USM migration from CPU to GPU
@@ -89,13 +92,13 @@ RTC_API_EXTERN_C bool prefetchUSMSharedOnGPU(RTCScene hscene)
       {
         // iterate over BVH memory buffer in steps of 4KB
         // (page size on Intel Data Center Max GPUs)
-        const size_t accel_size = scene->hwaccel.size() / (1 << 12);
+        const size_t accel_size = accelSize / (1 << 12);
         const size_t num_iterations = (accel_size + num_workers - 1) / num_workers;
         for (size_t j = 0; j < num_iterations; ++j) {
           const size_t offset = (idx * num_iterations + j) * (1 << 12);
           if (offset >= accel_size) continue;
-          result[idx] += ((size_t)scene->hwaccel[offset] % 32);
+          result[idx] += ((size_t)accelPtr[offset]) % 32;
         }
       }
     });
diff --git a/tutorials/triangle_geometry/triangle_geometry_device.cpp b/tutorials/triangle_geometry/triangle_geometry_device.cpp
index f7b257ff12..1314444677 100644
--- a/tutorials/triangle_geometry/triangle_geometry_device.cpp
+++ b/tutorials/triangle_geometry/triangle_geometry_device.cpp
@@ -135,6 +135,7 @@ void renderPixelStandard(const TutorialData& data,
   Vec3fa color = Vec3fa(0.0f);
   if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
   {
+#if 1
     Vec3fa diffuse = data.face_colors[ray.primID];
     color = color + diffuse*0.5f;
     Vec3fa lightDir = normalize(Vec3fa(-1,-1,-1));
@@ -152,6 +153,36 @@ void renderPixelStandard(const TutorialData& data,
     /* add light contribution */
     if (shadow.tfar >= 0.0f)
       color = color + diffuse*clamp(-dot(lightDir,normalize(ray.Ng)),0.0f,1.0f);
+#else
+
+#if 0
+    if (ray.geomID == 0)
+      color = Vec3fa(0.f, 0.f, 1.f);
+    else if (ray.geomID == 1)
+      color = Vec3fa(0.f, 1.f, 0.f);
+    else if (ray.geomID == 2)
+      color = Vec3fa(1.f, 0.f, 0.f);
+    else
+      color = Vec3fa(1.f);
+#endif
+
+#if 0
+    if (ray.primID == 0)
+      color = Vec3fa(0.f, 0.f, 1.f);
+    else if (ray.primID == 1)
+      color = Vec3fa(0.f, 1.f, 0.f);
+    else if (ray.primID == 2)
+      color = Vec3fa(1.f, 0.f, 0.f);
+    else
+      color = Vec3fa(1.f);
+#endif
+
+#if 0
+    color = Vec3fa(ray.u, ray.v, 0.f);
+#endif
+
+#endif
+
   }
 
   /* write color to framebuffer */