From 96d8067a7ebfa8fca1af7bb2cad1d60b77b4ae7d Mon Sep 17 00:00:00 2001
From: Florian Reibold
Date: Fri, 13 Sep 2024 10:30:17 +0200
Subject: [PATCH] [WIP] BVH buffer in explicitly managed host/device memory on
 systems without unified memory (e.g. dGPUs)

---
 kernels/common/scene.cpp               |  23 ++-
 kernels/common/scene.h                 |   9 +-
 kernels/level_zero/ze_wrapper.cpp      |  20 +++
 kernels/level_zero/ze_wrapper.h        |   2 +
 kernels/rthwif/testing/CMakeLists.txt  |   4 +-
 kernels/sycl/rthwif_embree.cpp         |  16 +-
 kernels/sycl/rthwif_embree_builder.cpp | 106 ++++++++++--
 kernels/sycl/rthwif_embree_builder.h   | 152 ++++++++++++------
 kernels/sycl/scene_sycl.cpp            |   7 +-
 .../triangle_geometry_device.cpp       |  31 ++++
 10 files changed, 286 insertions(+), 84 deletions(-)

diff --git a/kernels/common/scene.cpp b/kernels/common/scene.cpp
index d40c701b83..3fe6fa7ccc 100644
--- a/kernels/common/scene.cpp
+++ b/kernels/common/scene.cpp
@@ -56,8 +56,8 @@ namespace embree
 
     /* use proper device and context for SYCL allocations */
 #if defined(EMBREE_SYCL_SUPPORT)
-    if (DeviceGPU* gpu_device = dynamic_cast<DeviceGPU*>(device))
-      hwaccel = AccelBuffer(AccelAllocator<char>(device,gpu_device->getGPUDevice(),gpu_device->getGPUContext()),0);
+    if (dynamic_cast<DeviceGPU*>(device))
+      accelBuffer = AccelBuffer(device);
 #endif
 
     /* one can overwrite flags through device for debugging */
@@ -789,10 +789,8 @@ namespace embree
   void Scene::build_gpu_accels()
   {
 #if defined(EMBREE_SYCL_SUPPORT)
-    auto [aabb, stride] = rthwifBuild(this,hwaccel);
-    hwaccel_stride = stride;
-    bounds = LBBox(aabb);
-    hwaccel_bounds = aabb;
+    accelBuffer.build(this);
+    bounds = LBBox(accelBuffer.getBounds());
 #endif
   }
 
@@ -917,6 +915,10 @@ namespace embree
       taskGroup->scheduler = nullptr;
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
   }
 #endif
 
@@ -982,6 +984,10 @@ namespace embree
       accels_clear();
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
   }
 #endif
 
@@ -1024,6 +1030,11 @@ namespace embree
       accels_clear();
       throw;
     }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    accelBuffer.commit();
+#endif
+
   }
 #endif
 
diff --git a/kernels/common/scene.h b/kernels/common/scene.h
index f996154945..4bb12b14df 100644
--- a/kernels/common/scene.h
+++ b/kernels/common/scene.h
@@ -296,14 +296,7 @@ namespace embree
 
 #if defined(EMBREE_SYCL_SUPPORT)
   public:
-    BBox3f hwaccel_bounds = empty;
-    AccelBuffer hwaccel;   // the buffer containing the HW acceleration structures corresponding to the scene. One for each time segment, stored in a contiguous chunk of memory.
-    size_t hwaccel_stride; // the stride between two HW acceleration structures for different time segments stored in hwaccel.
-
-    __forceinline char* getHWAccel(uint32_t time_segment) const {
-      char* ptr = (char*)hwaccel.data() + time_segment * hwaccel_stride;
-      return ptr;
-    }
+    AccelBuffer accelBuffer;
 #endif
 
   private:
 
diff --git a/kernels/level_zero/ze_wrapper.cpp b/kernels/level_zero/ze_wrapper.cpp
index 50af2fcf31..d68afbf601 100644
--- a/kernels/level_zero/ze_wrapper.cpp
+++ b/kernels/level_zero/ze_wrapper.cpp
@@ -30,6 +30,8 @@ static std::mutex zeWrapperMutex;
 static void* handle = nullptr;
 
 static decltype(zeMemFree)* zeMemFreeInternal = nullptr;
+static decltype(zeMemAllocHost)* zeMemAllocHostInternal = nullptr;
+static decltype(zeMemAllocDevice)* zeMemAllocDeviceInternal = nullptr;
 static decltype(zeMemAllocShared)* zeMemAllocSharedInternal = nullptr;
 static decltype(zeDriverGetExtensionProperties)* zeDriverGetExtensionPropertiesInternal = nullptr;
 static decltype(zeDeviceGetProperties)* zeDeviceGetPropertiesInternal = nullptr;
@@ -156,6 +158,8 @@ ze_result_t ZeWrapper::init()
     handle = load_module();
 
     zeMemFreeInternal = find_symbol(handle, "zeMemFree");
+    zeMemAllocHostInternal = find_symbol(handle, "zeMemAllocHost");
+    zeMemAllocDeviceInternal = find_symbol(handle, "zeMemAllocDevice");
     zeMemAllocSharedInternal = find_symbol(handle, "zeMemAllocShared");
     zeDriverGetExtensionPropertiesInternal = find_symbol(handle, "zeDriverGetExtensionProperties");
     zeDeviceGetPropertiesInternal = find_symbol(handle, "zeDeviceGetProperties");
@@ -218,6 +222,22 @@ ze_result_t ZeWrapper::zeMemFree(ze_context_handle_t context, void* ptr)
   return zeMemFreeInternal(context, ptr);
 }
 
+ze_result_t ZeWrapper::zeMemAllocHost(ze_context_handle_t context, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, void** ptr)
+{
+  if (!handle || !zeMemAllocHostInternal)
+    throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");
+
+  return zeMemAllocHostInternal(context, desch, s0, s1, ptr);
+}
+
+ze_result_t ZeWrapper::zeMemAllocDevice(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
+{
+  if (!handle || !zeMemAllocDeviceInternal)
+    throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");
+
+  return zeMemAllocDeviceInternal(context, descd, s0, s1, ze_handle, ptr);
+}
+
 ze_result_t ZeWrapper::zeMemAllocShared(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
 {
   if (!handle || !zeMemAllocSharedInternal)
 
diff --git a/kernels/level_zero/ze_wrapper.h b/kernels/level_zero/ze_wrapper.h
index 3a2ca389a4..f4f409ed5a 100644
--- a/kernels/level_zero/ze_wrapper.h
+++ b/kernels/level_zero/ze_wrapper.h
@@ -39,6 +39,8 @@ struct ZeWrapper
   static ze_result_t initRTASBuilder(ze_driver_handle_t hDriver, RTAS_BUILD_MODE rtas_build_mode = RTAS_BUILD_MODE::AUTO);
 
   static ze_result_t zeMemFree(ze_context_handle_t, void*);
+  static ze_result_t zeMemAllocHost(ze_context_handle_t, const ze_host_mem_alloc_desc_t*, size_t, size_t, void**);
+  static ze_result_t zeMemAllocDevice(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
   static ze_result_t zeMemAllocShared(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, const ze_host_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
   static ze_result_t zeDriverGetExtensionProperties(ze_driver_handle_t, uint32_t*, ze_driver_extension_properties_t*);
   static ze_result_t zeDeviceGetProperties(ze_device_handle_t, ze_device_properties_t*);
diff --git a/kernels/rthwif/testing/CMakeLists.txt b/kernels/rthwif/testing/CMakeLists.txt
index 4c3f22e0bf..efaad9f6d4 100644
--- a/kernels/rthwif/testing/CMakeLists.txt
+++ b/kernels/rthwif/testing/CMakeLists.txt
@@ -31,14 +31,14 @@ IF (EMBREE_SYCL_RT_SIMULATION)
 ENDIF()
 
 ADD_EXECUTABLE(embree_rthwif_cornell_box rthwif_cornell_box.cpp)
-TARGET_LINK_LIBRARIES(embree_rthwif_cornell_box sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+TARGET_LINK_LIBRARIES(embree_rthwif_cornell_box sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper ${EMBREE_RTHWIF_SYCL})
 SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
 SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
 INSTALL(TARGETS embree_rthwif_cornell_box DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
 SIGN_TARGET(embree_rthwif_cornell_box)
 
 ADD_EXECUTABLE(embree_rthwif_test rthwif_test.cpp)
-TARGET_LINK_LIBRARIES(embree_rthwif_test sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+TARGET_LINK_LIBRARIES(embree_rthwif_test sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper ${EMBREE_RTHWIF_SYCL})
 SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
 SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
 INSTALL(TARGETS embree_rthwif_test DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
 
diff --git a/kernels/sycl/rthwif_embree.cpp b/kernels/sycl/rthwif_embree.cpp
index df00b40a36..98c66d2cd2 100644
--- a/kernels/sycl/rthwif_embree.cpp
+++ b/kernels/sycl/rthwif_embree.cpp
@@ -110,7 +110,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, RayHit& ray
     raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
 #endif
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);
   intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
   return false;
@@ -152,7 +152,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, Ray& ray, U
     raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
 #endif
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);
   intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
   return false;
@@ -208,7 +208,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, RayHit& ray, Ins
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -261,7 +261,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, Ray& ray, Instan
      bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -319,7 +319,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, RayHit& ra
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -374,7 +374,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, Ray& ray,
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
+    intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);
 
     intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
 
@@ -752,7 +752,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcIntersectRTHW(sycl::global_
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);
   intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
   intel_ray_query_start_traversal(query);
 
@@ -854,7 +854,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcOccludedRTHW(sycl::global_p
       bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
     }
 
-  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
+  intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);
   intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
   intel_ray_query_start_traversal(query);
 
diff --git a/kernels/sycl/rthwif_embree_builder.cpp b/kernels/sycl/rthwif_embree_builder.cpp
index 22ebd96bf1..662547ae39 100644
--- a/kernels/sycl/rthwif_embree_builder.cpp
+++ b/kernels/sycl/rthwif_embree_builder.cpp
@@ -156,7 +156,7 @@ namespace embree
     return sycl_device.get_info();
   }
 
-  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context)
+  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context, sycl::usm::alloc alloc_type)
   {
     ze_context_handle_t hContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(context);
     ze_device_handle_t hDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device);
@@ -176,28 +176,44 @@ namespace embree
     relaxed.stype = ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC;
     relaxed.pNext = &rt_desc;
     relaxed.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
-    
+
     ze_device_mem_alloc_desc_t device_desc;
     device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
     device_desc.pNext = &relaxed;
     device_desc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
     device_desc.ordinal = 0;
-    
+
     ze_host_mem_alloc_desc_t host_desc;
     host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
     host_desc.pNext = nullptr;
     host_desc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
-    
+
     void* ptr = nullptr;
+    // TODO: modify memory monitor to account for host and device memory separately
     if (embree_device) embree_device->memoryMonitor(bytes,false);
-    ze_result_t result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+
+    ze_result_t result;
+    switch (alloc_type) {
+      case sycl::usm::alloc::host:
+        result = ZeWrapper::zeMemAllocHost(hContext,&host_desc,bytes,rtasProp.rtasBufferAlignment,&ptr);
+        break;
+      case sycl::usm::alloc::device:
+        result = ZeWrapper::zeMemAllocDevice(hContext,&device_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+        break;
+      case sycl::usm::alloc::shared:
+        result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
+        break;
+      default:
+        throw_RTCError(RTC_ERROR_UNKNOWN, "sycl::usm::alloc type unknown in rthwifAllocAccelBuffer");
+    }
+
     if (result != ZE_RESULT_SUCCESS) throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"rtas memory allocation failed");
 
     return ptr;
   }
-  
+
   void rthwifFreeAccelBuffer(Device* embree_device, void* ptr, size_t bytes, sycl::context context)
   {
     if (ptr == nullptr) return;
@@ -362,9 +378,9 @@ namespace embree
     const AffineSpace3fa local2world = geom->getLocal2World();
     out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
     out->pTransform = (float*) &out->xfmdata;
-    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
+    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
     out->xfmdata = *(ze_rtas_transform_float3x4_aligned_column_major_exp_t*) &local2world;
-    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
+    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
   }
 
   void createGeometryDesc(ze_rtas_builder_instance_geometry_info_exp_t* out, Scene* scene, Instance* geom)
@@ -377,8 +393,8 @@ namespace embree
     out->instanceUserID = 0;
     out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
     out->pTransform = (float*) &geom->local2world[0];
-    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
-    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
+    out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
+    out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
   }
 
   void createGeometryDesc(char* out, Scene* scene, Geometry* geom, GEOMETRY_TYPE type)
@@ -417,7 +433,7 @@ namespace embree
     return result;
   }
 
-  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBuffer& accel)
+  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBufferData& accel)
   {
     DeviceGPU* gpuDevice = dynamic_cast<DeviceGPU*>(scene->device);
     if (gpuDevice == nullptr) throw std::runtime_error("internal error");
@@ -647,4 +663,72 @@ namespace embree
 
     return std::tie(fullBounds, sizeTotal.rtasBufferSizeBytesExpected);
   }
+
+  AccelBuffer::AccelBuffer(Device *device) : device(device)
+  {
+    DeviceGPU *gpu_device = dynamic_cast<DeviceGPU*>(device);
+
+    if (!gpu_device)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "AccelBuffer constructor called with non-GPU device");
+    }
+
+    unifiedMemory = false; //gpu_device->has_unified_memory();
+
+    if (unifiedMemory)
+    {
+      accelBufferShared = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::shared), 0);
+    }
+    else
+    {
+      accelBufferHost = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::host), 0);
+      accelBufferDevice = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::device), 0);
+    }
+  }
+
+  void AccelBuffer::build(Scene *scene)
+  {
+    auto [aabb, stride] = rthwifBuild(scene, getAccelBufferData());
+    hwaccel_stride = stride;
+    hwaccel_bounds = aabb;
+  }
+
+  void AccelBuffer::commit()
+  {
+    if (unifiedMemory) {
+      hwaccel = (char*)accelBufferShared.data();
+      return;
+    }
+
+    auto deviceGPU = dynamic_cast<DeviceGPU*>(device);
+    if (!deviceGPU) {
+      return;
+    }
+
+    std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
+    std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;
+
+    accelBufferDevice.resize(accelBufferHost.size());
+
+    sycl::queue queue(deviceGPU->getGPUDevice());
+    queue.memcpy(accelBufferDevice.data(), accelBufferHost.data(), accelBufferHost.size());
+    queue.wait_and_throw();
+
+    std::vector<char> host_data(accelBufferHost.size());
+    queue.memcpy(host_data.data(), accelBufferDevice.data(), accelBufferDevice.size());
+    queue.wait_and_throw();
+
+    for (size_t i = 0; i < accelBufferHost.size(); ++i) {
+      if (accelBufferHost[i] != host_data[i]) {
+        std::cout << (int)accelBufferHost[i] << " - " << (int)host_data[i] << std::endl;
+      }
+    }
+
+    std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
+    std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;
+
+    hwaccel = (char*)accelBufferDevice.data();
+    printf("hwaccel %p\n", hwaccel);
+  }
+
 }
 
diff --git a/kernels/sycl/rthwif_embree_builder.h b/kernels/sycl/rthwif_embree_builder.h
index d3f128f910..90b4d840d8 100644
--- a/kernels/sycl/rthwif_embree_builder.h
+++ b/kernels/sycl/rthwif_embree_builder.h
@@ -8,67 +8,125 @@
 #include "../../common/sys/vector.h"
 #include "../../common/math/bbox.h"
 #include "../../include/embree4/rtcore.h"
-  
+
 namespace embree
 {
   class Scene;
-
-  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context);
+
+  void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context, sycl::usm::alloc alloc_type);
   void rthwifFreeAccelBuffer(Device* embree_device, void* ptr, size_t bytes, sycl::context context);
+  void* zeRTASInitExp(sycl::device device, sycl::context context);
+
+  void rthwifCleanup(Device* embree_device, void* dispatchGlobalsPtr, sycl::context context);
+
+  int rthwifIsSYCLDeviceSupported(const sycl::device& sycl_device);
+
   /*! allocator that performs BVH memory allocations */
-  template<typename T>
-  struct AccelAllocator
+  template<typename T>
+  struct AccelAllocator
+  {
+    typedef T value_type;
+    typedef T *pointer;
+    typedef const T *const_pointer;
+    typedef T &reference;
+    typedef const T &const_reference;
+    typedef std::size_t size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    AccelAllocator()
+      : device(nullptr), context(nullptr), alloc_type(sycl::usm::alloc::unknown) {}
+
+    AccelAllocator(Device *embree_device, const sycl::device &device, const sycl::context &context, sycl::usm::alloc alloc_type)
+      : embree_device(embree_device), device(&device), context(&context), alloc_type(alloc_type) {}
+
+    __forceinline pointer allocate(size_type n)
+    {
+      if (context && device)
+        return (pointer)rthwifAllocAccelBuffer(embree_device, n * sizeof(T), *device, *context, alloc_type);
+      else
+        return nullptr;
+    }
+
+    __forceinline void deallocate(pointer p, size_type n)
+    {
+      if (context)
+        rthwifFreeAccelBuffer(embree_device, p, n * sizeof(T), *context);
+    }
+
+    __forceinline void construct(pointer p, const_reference val)
+    {
+      new (p) T(val);
+    }
+
+    __forceinline void destroy(pointer p)
     {
-      typedef T value_type;
-      typedef T* pointer;
-      typedef const T* const_pointer;
-      typedef T& reference;
-      typedef const T& const_reference;
-      typedef std::size_t size_type;
-      typedef std::ptrdiff_t difference_type;
-
-      AccelAllocator()
-        : device(nullptr), context(nullptr) {}
-
-      AccelAllocator(Device* embree_device, const sycl::device& device, const sycl::context& context)
-        : embree_device(embree_device), device(&device), context(&context) {}
-
-      __forceinline pointer allocate( size_type n ) {
-        if (context && device)
-          return (pointer) rthwifAllocAccelBuffer(embree_device,n*sizeof(T),*device,*context);
+      p->~T();
+    }
+
+  private:
+    Device *embree_device;
+    const sycl::device *device;
+    const sycl::context *context;
+    sycl::usm::alloc alloc_type;
+  };
+
+  typedef vector_t<char, AccelAllocator<char>> AccelBufferData;
+
+  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBufferData& buffer_o);
+
+  // The buffers containing the HW acceleration structures corresponding to the scene. One for each time segment, stored in a contiguous chunk of memory.
+  // On devices with unified memory a single USM shared allocation is used, so the host and device views of the BVH point to the same memory.
+  // On devices without unified memory, a USM host allocation is used for BVH building and on scene commit the data is copied to a USM device allocation on the device.
+  struct AccelBuffer {
+    AccelBuffer() {}
+    AccelBuffer(Device *device);
+
+    __forceinline char* getHWAccel(uint32_t time_segment) const {
+#if defined(__SYCL_DEVICE_ONLY__)
+      return hwaccel + time_segment * hwaccel_stride;
+      //return (char*)accelBufferShared.data() + time_segment * hwaccel_stride;
+#else
+      if (unifiedMemory)
+        return (char*)accelBufferShared.data() + time_segment * hwaccel_stride;
       else
-        return nullptr;
+        return (char*)accelBufferHost.data() + time_segment * hwaccel_stride;
+#endif
     }
-
-      __forceinline void deallocate( pointer p, size_type n ) {
-        if (context)
-          rthwifFreeAccelBuffer(embree_device,p,n*sizeof(T),*context);
-      }
-
-      __forceinline void construct( pointer p, const_reference val ) {
-        new (p) T(val);
+
+    __forceinline size_t getHWAccelSize() const {
+      if (unifiedMemory)
+        return accelBufferShared.size();
+      else
+        return accelBufferHost.size();
     }
-
-      __forceinline void destroy( pointer p ) {
-        p->~T();
+
+#if !defined(__SYCL_DEVICE_ONLY__)
+
+    __forceinline AccelBufferData& getAccelBufferData() {
+      if (unifiedMemory)
+        return accelBufferShared;
+      else
+        return accelBufferHost;
     }
-  private:
+    inline BBox3f const& getBounds() { return hwaccel_bounds; }
 
-    Device* embree_device;
-    const sycl::device* device;
-    const sycl::context* context;
-  };
+    void build(Scene* scene);
+    void commit();
+#endif
 
-  typedef vector_t<char,AccelAllocator<char>> AccelBuffer;
-
-  void* zeRTASInitExp(sycl::device device, sycl::context context);
-
-  void rthwifCleanup(Device* embree_device, void* dispatchGlobalsPtr, sycl::context context);
+  private:
+    AccelBufferData accelBufferHost;   // only used on systems where unified memory is not available (dGPU)
+    AccelBufferData accelBufferDevice; // only used on systems where unified memory is not available (dGPU)
+    AccelBufferData accelBufferShared; // only used when the system has unified memory between CPU and GPU (iGPU)
+    char* hwaccel;                     // pointer to the accel buffer on the device, only valid after scene commit
+
+    BBox3f hwaccel_bounds = empty;
+    size_t hwaccel_stride; // the stride between two HW acceleration structures for different time segments stored in hwaccel.
+    bool unifiedMemory;
+    Device* device;
+  };
 
-  int rthwifIsSYCLDeviceSupported(const sycl::device& sycl_device);
-
-  std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBuffer& buffer_o);
 }
diff --git a/kernels/sycl/scene_sycl.cpp b/kernels/sycl/scene_sycl.cpp
index 4e9b1fb304..386cbb538c 100644
--- a/kernels/sycl/scene_sycl.cpp
+++ b/kernels/sycl/scene_sycl.cpp
@@ -46,6 +46,9 @@ RTC_API_EXTERN_C bool prefetchUSMSharedOnGPU(RTCScene hscene)
   // we accumulate some nonsense data to prevent compiler
   // optimizing away the memory fetches in the GPU kernel
   size_t* result = sycl::malloc_shared<size_t>(num_workers, queue);
+
+  size_t accelSize = scene->accelBuffer.getHWAccelSize();
+  char* accelPtr = scene->accelBuffer.getHWAccel(0);
 
   // Use num_workers GPU work items to iterate over all USM shared
   // allocations to trigger USM migration from CPU to GPU
@@ -89,13 +92,13 @@ RTC_API_EXTERN_C bool prefetchUSMSharedOnGPU(RTCScene hscene)
       {
         // iterate over BVH memory buffer in steps of 4KB
         // (page size on Intel Data Center Max GPUs)
-        const size_t accel_size = scene->hwaccel.size() / (1 << 12);
+        const size_t accel_size = accelSize / (1 << 12);
         const size_t num_iterations = (accel_size + num_workers - 1) / num_workers;
         for (size_t j = 0; j < num_iterations; ++j) {
           const size_t offset = (idx * num_iterations + j) * (1 << 12);
           if (offset >= accel_size) continue;
-          result[idx] += ((size_t)scene->hwaccel[offset] % 32);
+          result[idx] += ((size_t)accelPtr[offset]) % 32;
         }
       }
     });
diff --git a/tutorials/triangle_geometry/triangle_geometry_device.cpp b/tutorials/triangle_geometry/triangle_geometry_device.cpp
index f7b257ff12..1314444677 100644
--- a/tutorials/triangle_geometry/triangle_geometry_device.cpp
+++ b/tutorials/triangle_geometry/triangle_geometry_device.cpp
@@ -135,6 +135,7 @@ void renderPixelStandard(const TutorialData& data,
   Vec3fa color = Vec3fa(0.0f);
   if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
   {
+#if 1
     Vec3fa diffuse = data.face_colors[ray.primID];
     color = color + diffuse*0.5f;
     Vec3fa lightDir = normalize(Vec3fa(-1,-1,-1));
@@ -152,6 +153,36 @@ void renderPixelStandard(const TutorialData& data,
     /* add light contribution */
     if (shadow.tfar >= 0.0f)
       color = color + diffuse*clamp(-dot(lightDir,normalize(ray.Ng)),0.0f,1.0f);
+#else
+
+#if 0
+    if (ray.geomID == 0)
+      color = Vec3fa(0.f, 0.f, 1.f);
+    else if (ray.geomID == 1)
+      color = Vec3fa(0.f, 1.f, 0.f);
+    else if (ray.geomID == 2)
+      color = Vec3fa(1.f, 0.f, 0.f);
+    else
+      color = Vec3fa(1.f);
+#endif
+
+#if 0
+    if (ray.primID == 0)
+      color = Vec3fa(0.f, 0.f, 1.f);
+    else if (ray.primID == 1)
+      color = Vec3fa(0.f, 1.f, 0.f);
+    else if (ray.primID == 2)
+      color = Vec3fa(1.f, 0.f, 0.f);
+    else
+      color = Vec3fa(1.f);
+#endif
+
+#if 0
+    color = Vec3fa(ray.u, ray.v, 0.f);
+#endif
+
+#endif
+
   }
 
   /* write color to framebuffer */